petsc-3.14.6 2021-03-30

  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 12:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 15:   n = A->cmap->n;
 16:   m = A->rmap->n;
 17:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
 18:   ns_row = a->inode.size;

 20:   min_mn = (m < n) ? m : n;
 21:   if (!ns) {
 22:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 23:     for (; count+1 < n; count++,i++) ;
 24:     if (count < n)  {
 25:       i++;
 26:     }
 27:     *size = i;
 28:     return(0);
 29:   }
 30:   PetscMalloc1(n+1,&ns_col);

 32:   /* Use the same row structure wherever feasible. */
 33:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 34:     ns_col[i] = ns_row[i];
 35:   }

 37:   /* if m < n; pad up the remainder with inode_limit */
 38:   for (; count+1 < n; count++,i++) {
 39:     ns_col[i] = 1;
 40:   }
 41:   /* The last node is the odd ball. padd it up with the remaining rows; */
 42:   if (count < n) {
 43:     ns_col[i] = n - count;
 44:     i++;
 45:   } else if (count > n) {
 46:     /* Adjust for the over estimation */
 47:     ns_col[i-1] += n - count;
 48:   }
 49:   *size = i;
 50:   *ns   = ns_col;
 51:   return(0);
 52: }


 55: /*
 56:       This builds symmetric version of nonzero structure,
 57: */
 58: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 59: {
 60:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 62:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 63:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 64:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 67:   nslim_row = a->inode.node_count;
 68:   m         = A->rmap->n;
 69:   n         = A->cmap->n;
 70:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 71:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");

 73:   /* Use the row_inode as column_inode */
 74:   nslim_col = nslim_row;
 75:   ns_col    = ns_row;

 77:   /* allocate space for reformated inode structure */
 78:   PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
 79:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 81:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 82:     nsz = ns_col[i1];
 83:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 84:   }
 85:   /* allocate space for row pointers */
 86:   PetscCalloc1(nslim_row+1,&ia);
 87:   *iia = ia;
 88:   PetscMalloc1(nslim_row+1,&work);

 90:   /* determine the number of columns in each row */
 91:   ia[0] = oshift;
 92:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 94:     j    = aj + ai[row] + ishift;
 95:     jmax = aj + ai[row+1] + ishift;
 96:     if (j==jmax) continue; /* empty row */
 97:     col  = *j++ + ishift;
 98:     i2   = tvc[col];
 99:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100:       ia[i1+1]++;
101:       ia[i2+1]++;
102:       i2++;                     /* Start col of next node */
103:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104:       i2 = tvc[col];
105:     }
106:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
107:   }

109:   /* shift ia[i] to point to next row */
110:   for (i1=1; i1<nslim_row+1; i1++) {
111:     row        = ia[i1-1];
112:     ia[i1]    += row;
113:     work[i1-1] = row - oshift;
114:   }

116:   /* allocate space for column pointers */
117:   nz   = ia[nslim_row] + (!ishift);
118:   PetscMalloc1(nz,&ja);
119:   *jja = ja;

121:   /* loop over lower triangular part putting into ja */
122:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123:     j    = aj + ai[row] + ishift;
124:     jmax = aj + ai[row+1] + ishift;
125:     if (j==jmax) continue; /* empty row */
126:     col  = *j++ + ishift;
127:     i2   = tvc[col];
128:     while (i2<i1 && j<jmax) {
129:       ja[work[i2]++] = i1 + oshift;
130:       ja[work[i1]++] = i2 + oshift;
131:       ++i2;
132:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133:       i2 = tvc[col];
134:     }
135:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

137:   }
138:   PetscFree(work);
139:   PetscFree2(tns,tvc);
140:   return(0);
141: }

143: /*
144:       This builds nonsymmetric version of nonzero structure,
145: */
146: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
147: {
148:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
150:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
151:   PetscInt       *tns,*tvc,nsz,i1,i2;
152:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

155:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
156:   nslim_row = a->inode.node_count;
157:   n         = A->cmap->n;

159:   /* Create The column_inode for this matrix */
160:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

162:   /* allocate space for reformated column_inode structure */
163:   PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
164:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

166:   for (i1=0,col=0; i1<nslim_col; ++i1) {
167:     nsz = ns_col[i1];
168:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
169:   }
170:   /* allocate space for row pointers */
171:   PetscCalloc1(nslim_row+1,&ia);
172:   *iia = ia;
173:   PetscMalloc1(nslim_row+1,&work);

175:   /* determine the number of columns in each row */
176:   ia[0] = oshift;
177:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
178:     j   = aj + ai[row] + ishift;
179:     nz  = ai[row+1] - ai[row];
180:     if (!nz) continue; /* empty row */
181:     col = *j++ + ishift;
182:     i2  = tvc[col];
183:     while (nz-- > 0) {           /* off-diagonal elemets */
184:       ia[i1+1]++;
185:       i2++;                     /* Start col of next node */
186:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
187:       if (nz > 0) i2 = tvc[col];
188:     }
189:   }

191:   /* shift ia[i] to point to next row */
192:   for (i1=1; i1<nslim_row+1; i1++) {
193:     row        = ia[i1-1];
194:     ia[i1]    += row;
195:     work[i1-1] = row - oshift;
196:   }

198:   /* allocate space for column pointers */
199:   nz   = ia[nslim_row] + (!ishift);
200:   PetscMalloc1(nz,&ja);
201:   *jja = ja;

203:   /* loop over matrix putting into ja */
204:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
205:     j   = aj + ai[row] + ishift;
206:     nz  = ai[row+1] - ai[row];
207:     if (!nz) continue; /* empty row */
208:     col = *j++ + ishift;
209:     i2  = tvc[col];
210:     while (nz-- > 0) {
211:       ja[work[i1]++] = i2 + oshift;
212:       ++i2;
213:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
214:       if (nz > 0) i2 = tvc[col];
215:     }
216:   }
217:   PetscFree(ns_col);
218:   PetscFree(work);
219:   PetscFree2(tns,tvc);
220:   return(0);
221: }

223: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
224: {
225:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

229:   *n = a->inode.node_count;
230:   if (!ia) return(0);
231:   if (!blockcompressed) {
232:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
233:   } else if (symmetric) {
234:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
235:   } else {
236:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
237:   }
238:   return(0);
239: }

241: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
242: {

246:   if (!ia) return(0);

248:   if (!blockcompressed) {
249:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
250:   } else {
251:     PetscFree(*ia);
252:     PetscFree(*ja);
253:   }
254:   return(0);
255: }

257: /* ----------------------------------------------------------- */

259: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
260: {
261:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
263:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
264:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

267:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
268:   nslim_row = a->inode.node_count;
269:   n         = A->cmap->n;

271:   /* Create The column_inode for this matrix */
272:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

274:   /* allocate space for reformated column_inode structure */
275:   PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
276:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

278:   for (i1=0,col=0; i1<nslim_col; ++i1) {
279:     nsz = ns_col[i1];
280:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
281:   }
282:   /* allocate space for column pointers */
283:   PetscCalloc1(nslim_col+1,&ia);
284:   *iia = ia;
285:   PetscMalloc1(nslim_col+1,&work);

287:   /* determine the number of columns in each row */
288:   ia[0] = oshift;
289:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
290:     j   = aj + ai[row] + ishift;
291:     col = *j++ + ishift;
292:     i2  = tvc[col];
293:     nz  = ai[row+1] - ai[row];
294:     while (nz-- > 0) {           /* off-diagonal elemets */
295:       /* ia[i1+1]++; */
296:       ia[i2+1]++;
297:       i2++;
298:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
299:       if (nz > 0) i2 = tvc[col];
300:     }
301:   }

303:   /* shift ia[i] to point to next col */
304:   for (i1=1; i1<nslim_col+1; i1++) {
305:     col        = ia[i1-1];
306:     ia[i1]    += col;
307:     work[i1-1] = col - oshift;
308:   }

310:   /* allocate space for column pointers */
311:   nz   = ia[nslim_col] + (!ishift);
312:   PetscMalloc1(nz,&ja);
313:   *jja = ja;

315:   /* loop over matrix putting into ja */
316:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
317:     j   = aj + ai[row] + ishift;
318:     col = *j++ + ishift;
319:     i2  = tvc[col];
320:     nz  = ai[row+1] - ai[row];
321:     while (nz-- > 0) {
322:       /* ja[work[i1]++] = i2 + oshift; */
323:       ja[work[i2]++] = i1 + oshift;
324:       i2++;
325:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
326:       if (nz > 0) i2 = tvc[col];
327:     }
328:   }
329:   PetscFree(ns_col);
330:   PetscFree(work);
331:   PetscFree2(tns,tvc);
332:   return(0);
333: }

335: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
336: {

340:   MatCreateColInode_Private(A,n,NULL);
341:   if (!ia) return(0);

343:   if (!blockcompressed) {
344:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
345:   } else if (symmetric) {
346:     /* Since the indices are symmetric it does'nt matter */
347:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
348:   } else {
349:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
350:   }
351:   return(0);
352: }

354: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
355: {

359:   if (!ia) return(0);
360:   if (!blockcompressed) {
361:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
362:   } else {
363:     PetscFree(*ia);
364:     PetscFree(*ja);
365:   }
366:   return(0);
367: }

369: /* ----------------------------------------------------------- */

371: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
372: {
373:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
374:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
375:   PetscScalar       *y;
376:   const PetscScalar *x;
377:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
378:   PetscErrorCode    ierr;
379:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
380:   const PetscInt    *idx,*ns,*ii;

382: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
384: #endif

387:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
388:   node_max = a->inode.node_count;
389:   ns       = a->inode.size;     /* Node Size array */
390:   VecGetArrayRead(xx,&x);
391:   VecGetArray(yy,&y);
392:   idx      = a->j;
393:   v1       = a->a;
394:   ii       = a->i;

396:   for (i = 0,row = 0; i< node_max; ++i) {
397:     nsz         = ns[i];
398:     n           = ii[1] - ii[0];
399:     nonzerorow += (n>0)*nsz;
400:     ii         += nsz;
401:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
402:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
403:     sz = n;                     /* No of non zeros in this row */
404:                                 /* Switch on the size of Node */
405:     switch (nsz) {               /* Each loop in 'case' is unrolled */
406:     case 1:
407:       sum1 = 0.;

409:       for (n = 0; n< sz-1; n+=2) {
410:         i1    = idx[0];         /* The instructions are ordered to */
411:         i2    = idx[1];         /* make the compiler's job easy */
412:         idx  += 2;
413:         tmp0  = x[i1];
414:         tmp1  = x[i2];
415:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
416:       }

418:       if (n == sz-1) {          /* Take care of the last nonzero  */
419:         tmp0  = x[*idx++];
420:         sum1 += *v1++ *tmp0;
421:       }
422:       y[row++]=sum1;
423:       break;
424:     case 2:
425:       sum1 = 0.;
426:       sum2 = 0.;
427:       v2   = v1 + n;

429:       for (n = 0; n< sz-1; n+=2) {
430:         i1    = idx[0];
431:         i2    = idx[1];
432:         idx  += 2;
433:         tmp0  = x[i1];
434:         tmp1  = x[i2];
435:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
437:       }
438:       if (n == sz-1) {
439:         tmp0  = x[*idx++];
440:         sum1 += *v1++ * tmp0;
441:         sum2 += *v2++ * tmp0;
442:       }
443:       y[row++]=sum1;
444:       y[row++]=sum2;
445:       v1      =v2;              /* Since the next block to be processed starts there*/
446:       idx    +=sz;
447:       break;
448:     case 3:
449:       sum1 = 0.;
450:       sum2 = 0.;
451:       sum3 = 0.;
452:       v2   = v1 + n;
453:       v3   = v2 + n;

455:       for (n = 0; n< sz-1; n+=2) {
456:         i1    = idx[0];
457:         i2    = idx[1];
458:         idx  += 2;
459:         tmp0  = x[i1];
460:         tmp1  = x[i2];
461:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
462:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
463:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
464:       }
465:       if (n == sz-1) {
466:         tmp0  = x[*idx++];
467:         sum1 += *v1++ * tmp0;
468:         sum2 += *v2++ * tmp0;
469:         sum3 += *v3++ * tmp0;
470:       }
471:       y[row++]=sum1;
472:       y[row++]=sum2;
473:       y[row++]=sum3;
474:       v1      =v3;              /* Since the next block to be processed starts there*/
475:       idx    +=2*sz;
476:       break;
477:     case 4:
478:       sum1 = 0.;
479:       sum2 = 0.;
480:       sum3 = 0.;
481:       sum4 = 0.;
482:       v2   = v1 + n;
483:       v3   = v2 + n;
484:       v4   = v3 + n;

486:       for (n = 0; n< sz-1; n+=2) {
487:         i1    = idx[0];
488:         i2    = idx[1];
489:         idx  += 2;
490:         tmp0  = x[i1];
491:         tmp1  = x[i2];
492:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
493:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
494:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
495:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
496:       }
497:       if (n == sz-1) {
498:         tmp0  = x[*idx++];
499:         sum1 += *v1++ * tmp0;
500:         sum2 += *v2++ * tmp0;
501:         sum3 += *v3++ * tmp0;
502:         sum4 += *v4++ * tmp0;
503:       }
504:       y[row++]=sum1;
505:       y[row++]=sum2;
506:       y[row++]=sum3;
507:       y[row++]=sum4;
508:       v1      =v4;              /* Since the next block to be processed starts there*/
509:       idx    +=3*sz;
510:       break;
511:     case 5:
512:       sum1 = 0.;
513:       sum2 = 0.;
514:       sum3 = 0.;
515:       sum4 = 0.;
516:       sum5 = 0.;
517:       v2   = v1 + n;
518:       v3   = v2 + n;
519:       v4   = v3 + n;
520:       v5   = v4 + n;

522:       for (n = 0; n<sz-1; n+=2) {
523:         i1    = idx[0];
524:         i2    = idx[1];
525:         idx  += 2;
526:         tmp0  = x[i1];
527:         tmp1  = x[i2];
528:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
529:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
530:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
531:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
532:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
533:       }
534:       if (n == sz-1) {
535:         tmp0  = x[*idx++];
536:         sum1 += *v1++ * tmp0;
537:         sum2 += *v2++ * tmp0;
538:         sum3 += *v3++ * tmp0;
539:         sum4 += *v4++ * tmp0;
540:         sum5 += *v5++ * tmp0;
541:       }
542:       y[row++]=sum1;
543:       y[row++]=sum2;
544:       y[row++]=sum3;
545:       y[row++]=sum4;
546:       y[row++]=sum5;
547:       v1      =v5;       /* Since the next block to be processed starts there */
548:       idx    +=4*sz;
549:       break;
550:     default:
551:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
552:     }
553:   }
554:   VecRestoreArrayRead(xx,&x);
555:   VecRestoreArray(yy,&y);
556:   PetscLogFlops(2.0*a->nz - nonzerorow);
557:   return(0);
558: }
559: /* ----------------------------------------------------------- */
560: /* Almost same code as the MatMult_SeqAIJ_Inode() */
561: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
562: {
563:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
564:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
565:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
566:   const PetscScalar *x;
567:   PetscScalar       *y,*z,*zt;
568:   PetscErrorCode    ierr;
569:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
570:   const PetscInt    *idx,*ns,*ii;

573:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
574:   node_max = a->inode.node_count;
575:   ns       = a->inode.size;     /* Node Size array */

577:   VecGetArrayRead(xx,&x);
578:   VecGetArrayPair(zz,yy,&z,&y);
579:   zt = z;

581:   idx = a->j;
582:   v1  = a->a;
583:   ii  = a->i;

585:   for (i = 0,row = 0; i< node_max; ++i) {
586:     nsz = ns[i];
587:     n   = ii[1] - ii[0];
588:     ii += nsz;
589:     sz  = n;                    /* No of non zeros in this row */
590:                                 /* Switch on the size of Node */
591:     switch (nsz) {               /* Each loop in 'case' is unrolled */
592:     case 1:
593:       sum1 = *zt++;

595:       for (n = 0; n< sz-1; n+=2) {
596:         i1    = idx[0];         /* The instructions are ordered to */
597:         i2    = idx[1];         /* make the compiler's job easy */
598:         idx  += 2;
599:         tmp0  = x[i1];
600:         tmp1  = x[i2];
601:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
602:       }

604:       if (n   == sz-1) {          /* Take care of the last nonzero  */
605:         tmp0  = x[*idx++];
606:         sum1 += *v1++ * tmp0;
607:       }
608:       y[row++]=sum1;
609:       break;
610:     case 2:
611:       sum1 = *zt++;
612:       sum2 = *zt++;
613:       v2   = v1 + n;

615:       for (n = 0; n< sz-1; n+=2) {
616:         i1    = idx[0];
617:         i2    = idx[1];
618:         idx  += 2;
619:         tmp0  = x[i1];
620:         tmp1  = x[i2];
621:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
622:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
623:       }
624:       if (n   == sz-1) {
625:         tmp0  = x[*idx++];
626:         sum1 += *v1++ * tmp0;
627:         sum2 += *v2++ * tmp0;
628:       }
629:       y[row++]=sum1;
630:       y[row++]=sum2;
631:       v1      =v2;              /* Since the next block to be processed starts there*/
632:       idx    +=sz;
633:       break;
634:     case 3:
635:       sum1 = *zt++;
636:       sum2 = *zt++;
637:       sum3 = *zt++;
638:       v2   = v1 + n;
639:       v3   = v2 + n;

641:       for (n = 0; n< sz-1; n+=2) {
642:         i1    = idx[0];
643:         i2    = idx[1];
644:         idx  += 2;
645:         tmp0  = x[i1];
646:         tmp1  = x[i2];
647:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
650:       }
651:       if (n == sz-1) {
652:         tmp0  = x[*idx++];
653:         sum1 += *v1++ * tmp0;
654:         sum2 += *v2++ * tmp0;
655:         sum3 += *v3++ * tmp0;
656:       }
657:       y[row++]=sum1;
658:       y[row++]=sum2;
659:       y[row++]=sum3;
660:       v1      =v3;              /* Since the next block to be processed starts there*/
661:       idx    +=2*sz;
662:       break;
663:     case 4:
664:       sum1 = *zt++;
665:       sum2 = *zt++;
666:       sum3 = *zt++;
667:       sum4 = *zt++;
668:       v2   = v1 + n;
669:       v3   = v2 + n;
670:       v4   = v3 + n;

672:       for (n = 0; n< sz-1; n+=2) {
673:         i1    = idx[0];
674:         i2    = idx[1];
675:         idx  += 2;
676:         tmp0  = x[i1];
677:         tmp1  = x[i2];
678:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
679:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
680:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
681:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
682:       }
683:       if (n == sz-1) {
684:         tmp0  = x[*idx++];
685:         sum1 += *v1++ * tmp0;
686:         sum2 += *v2++ * tmp0;
687:         sum3 += *v3++ * tmp0;
688:         sum4 += *v4++ * tmp0;
689:       }
690:       y[row++]=sum1;
691:       y[row++]=sum2;
692:       y[row++]=sum3;
693:       y[row++]=sum4;
694:       v1      =v4;              /* Since the next block to be processed starts there*/
695:       idx    +=3*sz;
696:       break;
697:     case 5:
698:       sum1 = *zt++;
699:       sum2 = *zt++;
700:       sum3 = *zt++;
701:       sum4 = *zt++;
702:       sum5 = *zt++;
703:       v2   = v1 + n;
704:       v3   = v2 + n;
705:       v4   = v3 + n;
706:       v5   = v4 + n;

708:       for (n = 0; n<sz-1; n+=2) {
709:         i1    = idx[0];
710:         i2    = idx[1];
711:         idx  += 2;
712:         tmp0  = x[i1];
713:         tmp1  = x[i2];
714:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
715:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
716:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
717:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
718:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
719:       }
720:       if (n   == sz-1) {
721:         tmp0  = x[*idx++];
722:         sum1 += *v1++ * tmp0;
723:         sum2 += *v2++ * tmp0;
724:         sum3 += *v3++ * tmp0;
725:         sum4 += *v4++ * tmp0;
726:         sum5 += *v5++ * tmp0;
727:       }
728:       y[row++]=sum1;
729:       y[row++]=sum2;
730:       y[row++]=sum3;
731:       y[row++]=sum4;
732:       y[row++]=sum5;
733:       v1      =v5;       /* Since the next block to be processed starts there */
734:       idx    +=4*sz;
735:       break;
736:     default:
737:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
738:     }
739:   }
740:   VecRestoreArrayRead(xx,&x);
741:   VecRestoreArrayPair(zz,yy,&z,&y);
742:   PetscLogFlops(2.0*a->nz);
743:   return(0);
744: }

746: /* ----------------------------------------------------------- */
747: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
748: {
749:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
750:   IS                iscol = a->col,isrow = a->row;
751:   PetscErrorCode    ierr;
752:   const PetscInt    *r,*c,*rout,*cout;
753:   PetscInt          i,j,n = A->rmap->n,nz;
754:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
755:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
756:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
757:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
758:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
759:   const PetscScalar *b;

762:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
763:   node_max = a->inode.node_count;
764:   ns       = a->inode.size;     /* Node Size array */

766:   VecGetArrayRead(bb,&b);
767:   VecGetArrayWrite(xx,&x);
768:   tmp  = a->solve_work;

770:   ISGetIndices(isrow,&rout); r = rout;
771:   ISGetIndices(iscol,&cout); c = cout + (n-1);

773:   /* forward solve the lower triangular */
774:   tmps = tmp;
775:   aa   = a_a;
776:   aj   = a_j;
777:   ad   = a->diag;

779:   for (i = 0,row = 0; i< node_max; ++i) {
780:     nsz = ns[i];
781:     aii = ai[row];
782:     v1  = aa + aii;
783:     vi  = aj + aii;
784:     nz  = ad[row]- aii;
785:     if (i < node_max-1) {
786:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787:       * but our indexing to determine it's size could. */
788:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
789:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
791:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792:     }

794:     switch (nsz) {               /* Each loop in 'case' is unrolled */
795:     case 1:
796:       sum1 = b[*r++];
797:       for (j=0; j<nz-1; j+=2) {
798:         i0    = vi[0];
799:         i1    = vi[1];
800:         vi   +=2;
801:         tmp0  = tmps[i0];
802:         tmp1  = tmps[i1];
803:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
804:       }
805:       if (j == nz-1) {
806:         tmp0  = tmps[*vi++];
807:         sum1 -= *v1++ *tmp0;
808:       }
809:       tmp[row++]=sum1;
810:       break;
811:     case 2:
812:       sum1 = b[*r++];
813:       sum2 = b[*r++];
814:       v2   = aa + ai[row+1];

816:       for (j=0; j<nz-1; j+=2) {
817:         i0    = vi[0];
818:         i1    = vi[1];
819:         vi   +=2;
820:         tmp0  = tmps[i0];
821:         tmp1  = tmps[i1];
822:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
824:       }
825:       if (j == nz-1) {
826:         tmp0  = tmps[*vi++];
827:         sum1 -= *v1++ *tmp0;
828:         sum2 -= *v2++ *tmp0;
829:       }
830:       sum2     -= *v2++ *sum1;
831:       tmp[row++]=sum1;
832:       tmp[row++]=sum2;
833:       break;
834:     case 3:
835:       sum1 = b[*r++];
836:       sum2 = b[*r++];
837:       sum3 = b[*r++];
838:       v2   = aa + ai[row+1];
839:       v3   = aa + ai[row+2];

841:       for (j=0; j<nz-1; j+=2) {
842:         i0    = vi[0];
843:         i1    = vi[1];
844:         vi   +=2;
845:         tmp0  = tmps[i0];
846:         tmp1  = tmps[i1];
847:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
850:       }
851:       if (j == nz-1) {
852:         tmp0  = tmps[*vi++];
853:         sum1 -= *v1++ *tmp0;
854:         sum2 -= *v2++ *tmp0;
855:         sum3 -= *v3++ *tmp0;
856:       }
857:       sum2 -= *v2++ * sum1;
858:       sum3 -= *v3++ * sum1;
859:       sum3 -= *v3++ * sum2;

861:       tmp[row++]=sum1;
862:       tmp[row++]=sum2;
863:       tmp[row++]=sum3;
864:       break;

866:     case 4:
867:       sum1 = b[*r++];
868:       sum2 = b[*r++];
869:       sum3 = b[*r++];
870:       sum4 = b[*r++];
871:       v2   = aa + ai[row+1];
872:       v3   = aa + ai[row+2];
873:       v4   = aa + ai[row+3];

875:       for (j=0; j<nz-1; j+=2) {
876:         i0    = vi[0];
877:         i1    = vi[1];
878:         vi   +=2;
879:         tmp0  = tmps[i0];
880:         tmp1  = tmps[i1];
881:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
882:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
883:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
884:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
885:       }
886:       if (j == nz-1) {
887:         tmp0  = tmps[*vi++];
888:         sum1 -= *v1++ *tmp0;
889:         sum2 -= *v2++ *tmp0;
890:         sum3 -= *v3++ *tmp0;
891:         sum4 -= *v4++ *tmp0;
892:       }
893:       sum2 -= *v2++ * sum1;
894:       sum3 -= *v3++ * sum1;
895:       sum4 -= *v4++ * sum1;
896:       sum3 -= *v3++ * sum2;
897:       sum4 -= *v4++ * sum2;
898:       sum4 -= *v4++ * sum3;

900:       tmp[row++]=sum1;
901:       tmp[row++]=sum2;
902:       tmp[row++]=sum3;
903:       tmp[row++]=sum4;
904:       break;
905:     case 5:
906:       sum1 = b[*r++];
907:       sum2 = b[*r++];
908:       sum3 = b[*r++];
909:       sum4 = b[*r++];
910:       sum5 = b[*r++];
911:       v2   = aa + ai[row+1];
912:       v3   = aa + ai[row+2];
913:       v4   = aa + ai[row+3];
914:       v5   = aa + ai[row+4];

916:       for (j=0; j<nz-1; j+=2) {
917:         i0    = vi[0];
918:         i1    = vi[1];
919:         vi   +=2;
920:         tmp0  = tmps[i0];
921:         tmp1  = tmps[i1];
922:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
926:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
927:       }
928:       if (j == nz-1) {
929:         tmp0  = tmps[*vi++];
930:         sum1 -= *v1++ *tmp0;
931:         sum2 -= *v2++ *tmp0;
932:         sum3 -= *v3++ *tmp0;
933:         sum4 -= *v4++ *tmp0;
934:         sum5 -= *v5++ *tmp0;
935:       }

937:       sum2 -= *v2++ * sum1;
938:       sum3 -= *v3++ * sum1;
939:       sum4 -= *v4++ * sum1;
940:       sum5 -= *v5++ * sum1;
941:       sum3 -= *v3++ * sum2;
942:       sum4 -= *v4++ * sum2;
943:       sum5 -= *v5++ * sum2;
944:       sum4 -= *v4++ * sum3;
945:       sum5 -= *v5++ * sum3;
946:       sum5 -= *v5++ * sum4;

948:       tmp[row++]=sum1;
949:       tmp[row++]=sum2;
950:       tmp[row++]=sum3;
951:       tmp[row++]=sum4;
952:       tmp[row++]=sum5;
953:       break;
954:     default:
955:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
956:     }
957:   }
958:   /* backward solve the upper triangular */
959:   for (i=node_max -1,row = n-1; i>=0; i--) {
960:     nsz = ns[i];
961:     aii = ai[row+1] -1;
962:     v1  = aa + aii;
963:     vi  = aj + aii;
964:     nz  = aii- ad[row];
965:     switch (nsz) {               /* Each loop in 'case' is unrolled */
966:     case 1:
967:       sum1 = tmp[row];

969:       for (j=nz; j>1; j-=2) {
970:         vi   -=2;
971:         i0    = vi[2];
972:         i1    = vi[1];
973:         tmp0  = tmps[i0];
974:         tmp1  = tmps[i1];
975:         v1   -= 2;
976:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
977:       }
978:       if (j==1) {
979:         tmp0  = tmps[*vi--];
980:         sum1 -= *v1-- * tmp0;
981:       }
982:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
983:       break;
984:     case 2:
985:       sum1 = tmp[row];
986:       sum2 = tmp[row -1];
987:       v2   = aa + ai[row]-1;
988:       for (j=nz; j>1; j-=2) {
989:         vi   -=2;
990:         i0    = vi[2];
991:         i1    = vi[1];
992:         tmp0  = tmps[i0];
993:         tmp1  = tmps[i1];
994:         v1   -= 2;
995:         v2   -= 2;
996:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
998:       }
999:       if (j==1) {
1000:         tmp0  = tmps[*vi--];
1001:         sum1 -= *v1-- * tmp0;
1002:         sum2 -= *v2-- * tmp0;
1003:       }

1005:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006:       sum2   -= *v2-- * tmp0;
1007:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1008:       break;
1009:     case 3:
1010:       sum1 = tmp[row];
1011:       sum2 = tmp[row -1];
1012:       sum3 = tmp[row -2];
1013:       v2   = aa + ai[row]-1;
1014:       v3   = aa + ai[row -1]-1;
1015:       for (j=nz; j>1; j-=2) {
1016:         vi   -=2;
1017:         i0    = vi[2];
1018:         i1    = vi[1];
1019:         tmp0  = tmps[i0];
1020:         tmp1  = tmps[i1];
1021:         v1   -= 2;
1022:         v2   -= 2;
1023:         v3   -= 2;
1024:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1025:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1026:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1027:       }
1028:       if (j==1) {
1029:         tmp0  = tmps[*vi--];
1030:         sum1 -= *v1-- * tmp0;
1031:         sum2 -= *v2-- * tmp0;
1032:         sum3 -= *v3-- * tmp0;
1033:       }
1034:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1035:       sum2   -= *v2-- * tmp0;
1036:       sum3   -= *v3-- * tmp0;
1037:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1038:       sum3   -= *v3-- * tmp0;
1039:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1041:       break;
1042:     case 4:
1043:       sum1 = tmp[row];
1044:       sum2 = tmp[row -1];
1045:       sum3 = tmp[row -2];
1046:       sum4 = tmp[row -3];
1047:       v2   = aa + ai[row]-1;
1048:       v3   = aa + ai[row -1]-1;
1049:       v4   = aa + ai[row -2]-1;

1051:       for (j=nz; j>1; j-=2) {
1052:         vi   -=2;
1053:         i0    = vi[2];
1054:         i1    = vi[1];
1055:         tmp0  = tmps[i0];
1056:         tmp1  = tmps[i1];
1057:         v1   -= 2;
1058:         v2   -= 2;
1059:         v3   -= 2;
1060:         v4   -= 2;
1061:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1065:       }
1066:       if (j==1) {
1067:         tmp0  = tmps[*vi--];
1068:         sum1 -= *v1-- * tmp0;
1069:         sum2 -= *v2-- * tmp0;
1070:         sum3 -= *v3-- * tmp0;
1071:         sum4 -= *v4-- * tmp0;
1072:       }

1074:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1075:       sum2   -= *v2-- * tmp0;
1076:       sum3   -= *v3-- * tmp0;
1077:       sum4   -= *v4-- * tmp0;
1078:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1079:       sum3   -= *v3-- * tmp0;
1080:       sum4   -= *v4-- * tmp0;
1081:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1082:       sum4   -= *v4-- * tmp0;
1083:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1084:       break;
1085:     case 5:
1086:       sum1 = tmp[row];
1087:       sum2 = tmp[row -1];
1088:       sum3 = tmp[row -2];
1089:       sum4 = tmp[row -3];
1090:       sum5 = tmp[row -4];
1091:       v2   = aa + ai[row]-1;
1092:       v3   = aa + ai[row -1]-1;
1093:       v4   = aa + ai[row -2]-1;
1094:       v5   = aa + ai[row -3]-1;
1095:       for (j=nz; j>1; j-=2) {
1096:         vi   -= 2;
1097:         i0    = vi[2];
1098:         i1    = vi[1];
1099:         tmp0  = tmps[i0];
1100:         tmp1  = tmps[i1];
1101:         v1   -= 2;
1102:         v2   -= 2;
1103:         v3   -= 2;
1104:         v4   -= 2;
1105:         v5   -= 2;
1106:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1107:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1108:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1109:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1110:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1111:       }
1112:       if (j==1) {
1113:         tmp0  = tmps[*vi--];
1114:         sum1 -= *v1-- * tmp0;
1115:         sum2 -= *v2-- * tmp0;
1116:         sum3 -= *v3-- * tmp0;
1117:         sum4 -= *v4-- * tmp0;
1118:         sum5 -= *v5-- * tmp0;
1119:       }

1121:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1122:       sum2   -= *v2-- * tmp0;
1123:       sum3   -= *v3-- * tmp0;
1124:       sum4   -= *v4-- * tmp0;
1125:       sum5   -= *v5-- * tmp0;
1126:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1127:       sum3   -= *v3-- * tmp0;
1128:       sum4   -= *v4-- * tmp0;
1129:       sum5   -= *v5-- * tmp0;
1130:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1131:       sum4   -= *v4-- * tmp0;
1132:       sum5   -= *v5-- * tmp0;
1133:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1134:       sum5   -= *v5-- * tmp0;
1135:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1136:       break;
1137:     default:
1138:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1139:     }
1140:   }
1141:   ISRestoreIndices(isrow,&rout);
1142:   ISRestoreIndices(iscol,&cout);
1143:   VecRestoreArrayRead(bb,&b);
1144:   VecRestoreArrayWrite(xx,&x);
1145:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1146:   return(0);
1147: }

1149: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1150: {
1151:   Mat             C     =B;
1152:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1153:   IS              isrow = b->row,isicol = b->icol;
1154:   PetscErrorCode  ierr;
1155:   const PetscInt  *r,*ic,*ics;
1156:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1157:   PetscInt        i,j,k,nz,nzL,row,*pj;
1158:   const PetscInt  *ajtmp,*bjtmp;
1159:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1160:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1161:   FactorShiftCtx  sctx;
1162:   const PetscInt  *ddiag;
1163:   PetscReal       rs;
1164:   MatScalar       d;
1165:   PetscInt        inod,nodesz,node_max,col;
1166:   const PetscInt  *ns;
1167:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1170:   /* MatPivotSetUp(): initialize shift context sctx */
1171:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1173:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1174:     ddiag          = a->diag;
1175:     sctx.shift_top = info->zeropivot;
1176:     for (i=0; i<n; i++) {
1177:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1178:       d  = (aa)[ddiag[i]];
1179:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1180:       v  = aa+ai[i];
1181:       nz = ai[i+1] - ai[i];
1182:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1183:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1184:     }
1185:     sctx.shift_top *= 1.1;
1186:     sctx.nshift_max = 5;
1187:     sctx.shift_lo   = 0.;
1188:     sctx.shift_hi   = 1.;
1189:   }

1191:   ISGetIndices(isrow,&r);
1192:   ISGetIndices(isicol,&ic);

1194:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1195:   ics   = ic;

1197:   node_max = a->inode.node_count;
1198:   ns       = a->inode.size;
1199:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1201:   /* If max inode size > 4, split it into two inodes.*/
1202:   /* also map the inode sizes according to the ordering */
1203:   PetscMalloc1(n+1,&tmp_vec1);
1204:   for (i=0,j=0; i<node_max; ++i,++j) {
1205:     if (ns[i] > 4) {
1206:       tmp_vec1[j] = 4;
1207:       ++j;
1208:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1209:     } else {
1210:       tmp_vec1[j] = ns[i];
1211:     }
1212:   }
1213:   /* Use the correct node_max */
1214:   node_max = j;

1216:   /* Now reorder the inode info based on mat re-ordering info */
1217:   /* First create a row -> inode_size_array_index map */
1218:   PetscMalloc1(n+1,&nsmap);
1219:   PetscMalloc1(node_max+1,&tmp_vec2);
1220:   for (i=0,row=0; i<node_max; i++) {
1221:     nodesz = tmp_vec1[i];
1222:     for (j=0; j<nodesz; j++,row++) {
1223:       nsmap[row] = i;
1224:     }
1225:   }
1226:   /* Using nsmap, create a reordered ns structure */
1227:   for (i=0,j=0; i< node_max; i++) {
1228:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1229:     tmp_vec2[i] = nodesz;
1230:     j          += nodesz;
1231:   }
1232:   PetscFree(nsmap);
1233:   PetscFree(tmp_vec1);

1235:   /* Now use the correct ns */
1236:   ns = tmp_vec2;

1238:   do {
1239:     sctx.newshift = PETSC_FALSE;
1240:     /* Now loop over each block-row, and do the factorization */
1241:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1242:       nodesz = ns[inod];

1244:       switch (nodesz) {
1245:       case 1:
1246:         /*----------*/
1247:         /* zero rtmp1 */
1248:         /* L part */
1249:         nz    = bi[i+1] - bi[i];
1250:         bjtmp = bj + bi[i];
1251:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1253:         /* U part */
1254:         nz    = bdiag[i]-bdiag[i+1];
1255:         bjtmp = bj + bdiag[i+1]+1;
1256:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1258:         /* load in initial (unfactored row) */
1259:         nz    = ai[r[i]+1] - ai[r[i]];
1260:         ajtmp = aj + ai[r[i]];
1261:         v     = aa + ai[r[i]];
1262:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1264:         /* ZeropivotApply() */
1265:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1267:         /* elimination */
1268:         bjtmp = bj + bi[i];
1269:         row   = *bjtmp++;
1270:         nzL   = bi[i+1] - bi[i];
1271:         for (k=0; k < nzL; k++) {
1272:           pc = rtmp1 + row;
1273:           if (*pc != 0.0) {
1274:             pv   = b->a + bdiag[row];
1275:             mul1 = *pc * (*pv);
1276:             *pc  = mul1;
1277:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1278:             pv   = b->a + bdiag[row+1]+1;
1279:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1280:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1281:             PetscLogFlops(1+2.0*nz);
1282:           }
1283:           row = *bjtmp++;
1284:         }

1286:         /* finished row so stick it into b->a */
1287:         rs = 0.0;
1288:         /* L part */
1289:         pv = b->a + bi[i];
1290:         pj = b->j + bi[i];
1291:         nz = bi[i+1] - bi[i];
1292:         for (j=0; j<nz; j++) {
1293:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1294:         }

1296:         /* U part */
1297:         pv = b->a + bdiag[i+1]+1;
1298:         pj = b->j + bdiag[i+1]+1;
1299:         nz = bdiag[i] - bdiag[i+1]-1;
1300:         for (j=0; j<nz; j++) {
1301:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1302:         }

1304:         /* Check zero pivot */
1305:         sctx.rs = rs;
1306:         sctx.pv = rtmp1[i];
1307:         MatPivotCheck(B,A,info,&sctx,i);
1308:         if (sctx.newshift) break;

1310:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1311:         pv  = b->a + bdiag[i];
1312:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1313:         break;

1315:       case 2:
1316:         /*----------*/
1317:         /* zero rtmp1 and rtmp2 */
1318:         /* L part */
1319:         nz    = bi[i+1] - bi[i];
1320:         bjtmp = bj + bi[i];
1321:         for  (j=0; j<nz; j++) {
1322:           col        = bjtmp[j];
1323:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1324:         }

1326:         /* U part */
1327:         nz    = bdiag[i]-bdiag[i+1];
1328:         bjtmp = bj + bdiag[i+1]+1;
1329:         for  (j=0; j<nz; j++) {
1330:           col        = bjtmp[j];
1331:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1332:         }

1334:         /* load in initial (unfactored row) */
1335:         nz    = ai[r[i]+1] - ai[r[i]];
1336:         ajtmp = aj + ai[r[i]];
1337:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1338:         for (j=0; j<nz; j++) {
1339:           col        = ics[ajtmp[j]];
1340:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1341:         }
1342:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1343:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1345:         /* elimination */
1346:         bjtmp = bj + bi[i];
1347:         row   = *bjtmp++; /* pivot row */
1348:         nzL   = bi[i+1] - bi[i];
1349:         for (k=0; k < nzL; k++) {
1350:           pc1 = rtmp1 + row;
1351:           pc2 = rtmp2 + row;
1352:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1353:             pv   = b->a + bdiag[row];
1354:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1355:             *pc1 = mul1;       *pc2 = mul2;

1357:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1358:             pv = b->a + bdiag[row+1]+1;
1359:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1360:             for (j=0; j<nz; j++) {
1361:               col         = pj[j];
1362:               rtmp1[col] -= mul1 * pv[j];
1363:               rtmp2[col] -= mul2 * pv[j];
1364:             }
1365:             PetscLogFlops(2+4.0*nz);
1366:           }
1367:           row = *bjtmp++;
1368:         }

1370:         /* finished row i; check zero pivot, then stick row i into b->a */
1371:         rs = 0.0;
1372:         /* L part */
1373:         pc1 = b->a + bi[i];
1374:         pj  = b->j + bi[i];
1375:         nz  = bi[i+1] - bi[i];
1376:         for (j=0; j<nz; j++) {
1377:           col    = pj[j];
1378:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1379:         }
1380:         /* U part */
1381:         pc1 = b->a + bdiag[i+1]+1;
1382:         pj  = b->j + bdiag[i+1]+1;
1383:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1384:         for (j=0; j<nz; j++) {
1385:           col    = pj[j];
1386:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1387:         }

1389:         sctx.rs = rs;
1390:         sctx.pv = rtmp1[i];
1391:         MatPivotCheck(B,A,info,&sctx,i);
1392:         if (sctx.newshift) break;
1393:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1394:         *pc1 = 1.0/sctx.pv;

1396:         /* Now take care of diagonal 2x2 block. */
1397:         pc2 = rtmp2 + i;
1398:         if (*pc2 != 0.0) {
1399:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1400:           *pc2 = mul1;          /* insert L entry */
1401:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1402:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1403:           for (j=0; j<nz; j++) {
1404:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1405:           }
1406:           PetscLogFlops(1+2.0*nz);
1407:         }

1409:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1410:         rs = 0.0;
1411:         /* L part */
1412:         pc2 = b->a + bi[i+1];
1413:         pj  = b->j + bi[i+1];
1414:         nz  = bi[i+2] - bi[i+1];
1415:         for (j=0; j<nz; j++) {
1416:           col    = pj[j];
1417:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1418:         }
1419:         /* U part */
1420:         pc2 = b->a + bdiag[i+2]+1;
1421:         pj  = b->j + bdiag[i+2]+1;
1422:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1423:         for (j=0; j<nz; j++) {
1424:           col    = pj[j];
1425:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1426:         }

1428:         sctx.rs = rs;
1429:         sctx.pv = rtmp2[i+1];
1430:         MatPivotCheck(B,A,info,&sctx,i+1);
1431:         if (sctx.newshift) break;
1432:         pc2  = b->a + bdiag[i+1];
1433:         *pc2 = 1.0/sctx.pv;
1434:         break;

1436:       case 3:
1437:         /*----------*/
1438:         /* zero rtmp */
1439:         /* L part */
1440:         nz    = bi[i+1] - bi[i];
1441:         bjtmp = bj + bi[i];
1442:         for  (j=0; j<nz; j++) {
1443:           col        = bjtmp[j];
1444:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1445:         }

1447:         /* U part */
1448:         nz    = bdiag[i]-bdiag[i+1];
1449:         bjtmp = bj + bdiag[i+1]+1;
1450:         for  (j=0; j<nz; j++) {
1451:           col        = bjtmp[j];
1452:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1453:         }

1455:         /* load in initial (unfactored row) */
1456:         nz    = ai[r[i]+1] - ai[r[i]];
1457:         ajtmp = aj + ai[r[i]];
1458:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1459:         for (j=0; j<nz; j++) {
1460:           col        = ics[ajtmp[j]];
1461:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1462:         }
1463:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1464:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1466:         /* elimination */
1467:         bjtmp = bj + bi[i];
1468:         row   = *bjtmp++; /* pivot row */
1469:         nzL   = bi[i+1] - bi[i];
1470:         for (k=0; k < nzL; k++) {
1471:           pc1 = rtmp1 + row;
1472:           pc2 = rtmp2 + row;
1473:           pc3 = rtmp3 + row;
1474:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1475:             pv   = b->a + bdiag[row];
1476:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1477:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1479:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1480:             pv = b->a + bdiag[row+1]+1;
1481:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1482:             for (j=0; j<nz; j++) {
1483:               col         = pj[j];
1484:               rtmp1[col] -= mul1 * pv[j];
1485:               rtmp2[col] -= mul2 * pv[j];
1486:               rtmp3[col] -= mul3 * pv[j];
1487:             }
1488:             PetscLogFlops(3+6.0*nz);
1489:           }
1490:           row = *bjtmp++;
1491:         }

1493:         /* finished row i; check zero pivot, then stick row i into b->a */
1494:         rs = 0.0;
1495:         /* L part */
1496:         pc1 = b->a + bi[i];
1497:         pj  = b->j + bi[i];
1498:         nz  = bi[i+1] - bi[i];
1499:         for (j=0; j<nz; j++) {
1500:           col    = pj[j];
1501:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1502:         }
1503:         /* U part */
1504:         pc1 = b->a + bdiag[i+1]+1;
1505:         pj  = b->j + bdiag[i+1]+1;
1506:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1507:         for (j=0; j<nz; j++) {
1508:           col    = pj[j];
1509:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1510:         }

1512:         sctx.rs = rs;
1513:         sctx.pv = rtmp1[i];
1514:         MatPivotCheck(B,A,info,&sctx,i);
1515:         if (sctx.newshift) break;
1516:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1517:         *pc1 = 1.0/sctx.pv;

1519:         /* Now take care of 1st column of diagonal 3x3 block. */
1520:         pc2 = rtmp2 + i;
1521:         pc3 = rtmp3 + i;
1522:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1523:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1524:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1525:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1526:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1527:           for (j=0; j<nz; j++) {
1528:             col         = pj[j];
1529:             rtmp2[col] -= mul2 * rtmp1[col];
1530:             rtmp3[col] -= mul3 * rtmp1[col];
1531:           }
1532:           PetscLogFlops(2+4.0*nz);
1533:         }

1535:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1536:         rs = 0.0;
1537:         /* L part */
1538:         pc2 = b->a + bi[i+1];
1539:         pj  = b->j + bi[i+1];
1540:         nz  = bi[i+2] - bi[i+1];
1541:         for (j=0; j<nz; j++) {
1542:           col    = pj[j];
1543:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1544:         }
1545:         /* U part */
1546:         pc2 = b->a + bdiag[i+2]+1;
1547:         pj  = b->j + bdiag[i+2]+1;
1548:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1549:         for (j=0; j<nz; j++) {
1550:           col    = pj[j];
1551:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1552:         }

1554:         sctx.rs = rs;
1555:         sctx.pv = rtmp2[i+1];
1556:         MatPivotCheck(B,A,info,&sctx,i+1);
1557:         if (sctx.newshift) break;
1558:         pc2  = b->a + bdiag[i+1];
1559:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1561:         /* Now take care of 2nd column of diagonal 3x3 block. */
1562:         pc3 = rtmp3 + i+1;
1563:         if (*pc3 != 0.0) {
1564:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1565:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1566:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1567:           for (j=0; j<nz; j++) {
1568:             col         = pj[j];
1569:             rtmp3[col] -= mul3 * rtmp2[col];
1570:           }
1571:           PetscLogFlops(1+2.0*nz);
1572:         }

1574:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1575:         rs = 0.0;
1576:         /* L part */
1577:         pc3 = b->a + bi[i+2];
1578:         pj  = b->j + bi[i+2];
1579:         nz  = bi[i+3] - bi[i+2];
1580:         for (j=0; j<nz; j++) {
1581:           col    = pj[j];
1582:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1583:         }
1584:         /* U part */
1585:         pc3 = b->a + bdiag[i+3]+1;
1586:         pj  = b->j + bdiag[i+3]+1;
1587:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1588:         for (j=0; j<nz; j++) {
1589:           col    = pj[j];
1590:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1591:         }

1593:         sctx.rs = rs;
1594:         sctx.pv = rtmp3[i+2];
1595:         MatPivotCheck(B,A,info,&sctx,i+2);
1596:         if (sctx.newshift) break;
1597:         pc3  = b->a + bdiag[i+2];
1598:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1599:         break;
1600:       case 4:
1601:         /*----------*/
1602:         /* zero rtmp */
1603:         /* L part */
1604:         nz    = bi[i+1] - bi[i];
1605:         bjtmp = bj + bi[i];
1606:         for  (j=0; j<nz; j++) {
1607:           col        = bjtmp[j];
1608:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1609:         }

1611:         /* U part */
1612:         nz    = bdiag[i]-bdiag[i+1];
1613:         bjtmp = bj + bdiag[i+1]+1;
1614:         for  (j=0; j<nz; j++) {
1615:           col        = bjtmp[j];
1616:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1617:         }

1619:         /* load in initial (unfactored row) */
1620:         nz    = ai[r[i]+1] - ai[r[i]];
1621:         ajtmp = aj + ai[r[i]];
1622:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1623:         for (j=0; j<nz; j++) {
1624:           col        = ics[ajtmp[j]];
1625:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1626:         }
1627:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1628:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1630:         /* elimination */
1631:         bjtmp = bj + bi[i];
1632:         row   = *bjtmp++; /* pivot row */
1633:         nzL   = bi[i+1] - bi[i];
1634:         for (k=0; k < nzL; k++) {
1635:           pc1 = rtmp1 + row;
1636:           pc2 = rtmp2 + row;
1637:           pc3 = rtmp3 + row;
1638:           pc4 = rtmp4 + row;
1639:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1640:             pv   = b->a + bdiag[row];
1641:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1642:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1644:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1645:             pv = b->a + bdiag[row+1]+1;
1646:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1647:             for (j=0; j<nz; j++) {
1648:               col         = pj[j];
1649:               rtmp1[col] -= mul1 * pv[j];
1650:               rtmp2[col] -= mul2 * pv[j];
1651:               rtmp3[col] -= mul3 * pv[j];
1652:               rtmp4[col] -= mul4 * pv[j];
1653:             }
1654:             PetscLogFlops(4+8.0*nz);
1655:           }
1656:           row = *bjtmp++;
1657:         }

1659:         /* finished row i; check zero pivot, then stick row i into b->a */
1660:         rs = 0.0;
1661:         /* L part */
1662:         pc1 = b->a + bi[i];
1663:         pj  = b->j + bi[i];
1664:         nz  = bi[i+1] - bi[i];
1665:         for (j=0; j<nz; j++) {
1666:           col    = pj[j];
1667:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1668:         }
1669:         /* U part */
1670:         pc1 = b->a + bdiag[i+1]+1;
1671:         pj  = b->j + bdiag[i+1]+1;
1672:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1673:         for (j=0; j<nz; j++) {
1674:           col    = pj[j];
1675:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1676:         }

1678:         sctx.rs = rs;
1679:         sctx.pv = rtmp1[i];
1680:         MatPivotCheck(B,A,info,&sctx,i);
1681:         if (sctx.newshift) break;
1682:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1683:         *pc1 = 1.0/sctx.pv;

1685:         /* Now take care of 1st column of diagonal 4x4 block. */
1686:         pc2 = rtmp2 + i;
1687:         pc3 = rtmp3 + i;
1688:         pc4 = rtmp4 + i;
1689:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1690:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1691:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1692:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1693:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1694:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1695:           for (j=0; j<nz; j++) {
1696:             col         = pj[j];
1697:             rtmp2[col] -= mul2 * rtmp1[col];
1698:             rtmp3[col] -= mul3 * rtmp1[col];
1699:             rtmp4[col] -= mul4 * rtmp1[col];
1700:           }
1701:           PetscLogFlops(3+6.0*nz);
1702:         }

1704:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1705:         rs = 0.0;
1706:         /* L part */
1707:         pc2 = b->a + bi[i+1];
1708:         pj  = b->j + bi[i+1];
1709:         nz  = bi[i+2] - bi[i+1];
1710:         for (j=0; j<nz; j++) {
1711:           col    = pj[j];
1712:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1713:         }
1714:         /* U part */
1715:         pc2 = b->a + bdiag[i+2]+1;
1716:         pj  = b->j + bdiag[i+2]+1;
1717:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1718:         for (j=0; j<nz; j++) {
1719:           col    = pj[j];
1720:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1721:         }

1723:         sctx.rs = rs;
1724:         sctx.pv = rtmp2[i+1];
1725:         MatPivotCheck(B,A,info,&sctx,i+1);
1726:         if (sctx.newshift) break;
1727:         pc2  = b->a + bdiag[i+1];
1728:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1730:         /* Now take care of 2nd column of diagonal 4x4 block. */
1731:         pc3 = rtmp3 + i+1;
1732:         pc4 = rtmp4 + i+1;
1733:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1734:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1735:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1736:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1737:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1738:           for (j=0; j<nz; j++) {
1739:             col         = pj[j];
1740:             rtmp3[col] -= mul3 * rtmp2[col];
1741:             rtmp4[col] -= mul4 * rtmp2[col];
1742:           }
1743:           PetscLogFlops(4.0*nz);
1744:         }

1746:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1747:         rs = 0.0;
1748:         /* L part */
1749:         pc3 = b->a + bi[i+2];
1750:         pj  = b->j + bi[i+2];
1751:         nz  = bi[i+3] - bi[i+2];
1752:         for (j=0; j<nz; j++) {
1753:           col    = pj[j];
1754:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1755:         }
1756:         /* U part */
1757:         pc3 = b->a + bdiag[i+3]+1;
1758:         pj  = b->j + bdiag[i+3]+1;
1759:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1760:         for (j=0; j<nz; j++) {
1761:           col    = pj[j];
1762:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1763:         }

1765:         sctx.rs = rs;
1766:         sctx.pv = rtmp3[i+2];
1767:         MatPivotCheck(B,A,info,&sctx,i+2);
1768:         if (sctx.newshift) break;
1769:         pc3  = b->a + bdiag[i+2];
1770:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1772:         /* Now take care of 3rd column of diagonal 4x4 block. */
1773:         pc4 = rtmp4 + i+2;
1774:         if (*pc4 != 0.0) {
1775:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1776:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1777:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1778:           for (j=0; j<nz; j++) {
1779:             col         = pj[j];
1780:             rtmp4[col] -= mul4 * rtmp3[col];
1781:           }
1782:           PetscLogFlops(1+2.0*nz);
1783:         }

1785:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1786:         rs = 0.0;
1787:         /* L part */
1788:         pc4 = b->a + bi[i+3];
1789:         pj  = b->j + bi[i+3];
1790:         nz  = bi[i+4] - bi[i+3];
1791:         for (j=0; j<nz; j++) {
1792:           col    = pj[j];
1793:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1794:         }
1795:         /* U part */
1796:         pc4 = b->a + bdiag[i+4]+1;
1797:         pj  = b->j + bdiag[i+4]+1;
1798:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1799:         for (j=0; j<nz; j++) {
1800:           col    = pj[j];
1801:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1802:         }

1804:         sctx.rs = rs;
1805:         sctx.pv = rtmp4[i+3];
1806:         MatPivotCheck(B,A,info,&sctx,i+3);
1807:         if (sctx.newshift) break;
1808:         pc4  = b->a + bdiag[i+3];
1809:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1810:         break;

1812:       default:
1813:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1814:       }
1815:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1816:       i += nodesz;                 /* Update the row */
1817:     }

1819:     /* MatPivotRefine() */
1820:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1821:       /*
1822:        * if no shift in this attempt & shifting & started shifting & can refine,
1823:        * then try lower shift
1824:        */
1825:       sctx.shift_hi       = sctx.shift_fraction;
1826:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1827:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1828:       sctx.newshift       = PETSC_TRUE;
1829:       sctx.nshift++;
1830:     }
1831:   } while (sctx.newshift);

1833:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1834:   PetscFree(tmp_vec2);
1835:   ISRestoreIndices(isicol,&ic);
1836:   ISRestoreIndices(isrow,&r);

1838:   if (b->inode.size) {
1839:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1840:   } else {
1841:     C->ops->solve           = MatSolve_SeqAIJ;
1842:   }
1843:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1844:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1845:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1846:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1847:   C->assembled              = PETSC_TRUE;
1848:   C->preallocated           = PETSC_TRUE;

1850:   PetscLogFlops(C->cmap->n);

1852:   /* MatShiftView(A,info,&sctx) */
1853:   if (sctx.nshift) {
1854:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1855:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1856:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1857:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1858:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1859:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1860:     }
1861:   }
1862:   return(0);
1863: }

1865: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1866: {
1867:   Mat             C     = B;
1868:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1869:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1870:   PetscErrorCode  ierr;
1871:   const PetscInt  *r,*ic,*c,*ics;
1872:   PetscInt        n   = A->rmap->n,*bi = b->i;
1873:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1874:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1875:   PetscInt        *ai = a->i,*aj = a->j;
1876:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1877:   PetscScalar     mul1,mul2,mul3,tmp;
1878:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1879:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1880:   PetscReal       rs=0.0;
1881:   FactorShiftCtx  sctx;

1884:   sctx.shift_top      = 0;
1885:   sctx.nshift_max     = 0;
1886:   sctx.shift_lo       = 0;
1887:   sctx.shift_hi       = 0;
1888:   sctx.shift_fraction = 0;

1890:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1891:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1892:     sctx.shift_top = 0;
1893:     for (i=0; i<n; i++) {
1894:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1895:       rs    = 0.0;
1896:       ajtmp = aj + ai[i];
1897:       rtmp1 = aa + ai[i];
1898:       nz    = ai[i+1] - ai[i];
1899:       for (j=0; j<nz; j++) {
1900:         if (*ajtmp != i) {
1901:           rs += PetscAbsScalar(*rtmp1++);
1902:         } else {
1903:           rs -= PetscRealPart(*rtmp1++);
1904:         }
1905:         ajtmp++;
1906:       }
1907:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1908:     }
1909:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1910:     sctx.shift_top *= 1.1;
1911:     sctx.nshift_max = 5;
1912:     sctx.shift_lo   = 0.;
1913:     sctx.shift_hi   = 1.;
1914:   }
1915:   sctx.shift_amount = 0;
1916:   sctx.nshift       = 0;

1918:   ISGetIndices(isrow,&r);
1919:   ISGetIndices(iscol,&c);
1920:   ISGetIndices(isicol,&ic);
1921:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1922:   ics    = ic;

1924:   node_max = a->inode.node_count;
1925:   ns       = a->inode.size;
1926:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1928:   /* If max inode size > 3, split it into two inodes.*/
1929:   /* also map the inode sizes according to the ordering */
1930:   PetscMalloc1(n+1,&tmp_vec1);
1931:   for (i=0,j=0; i<node_max; ++i,++j) {
1932:     if (ns[i]>3) {
1933:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1934:       ++j;
1935:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1936:     } else {
1937:       tmp_vec1[j] = ns[i];
1938:     }
1939:   }
1940:   /* Use the correct node_max */
1941:   node_max = j;

1943:   /* Now reorder the inode info based on mat re-ordering info */
1944:   /* First create a row -> inode_size_array_index map */
1945:   PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1946:   for (i=0,row=0; i<node_max; i++) {
1947:     nodesz = tmp_vec1[i];
1948:     for (j=0; j<nodesz; j++,row++) {
1949:       nsmap[row] = i;
1950:     }
1951:   }
1952:   /* Using nsmap, create a reordered ns structure */
1953:   for (i=0,j=0; i< node_max; i++) {
1954:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1955:     tmp_vec2[i] = nodesz;
1956:     j          += nodesz;
1957:   }
1958:   PetscFree2(nsmap,tmp_vec1);
1959:   /* Now use the correct ns */
1960:   ns = tmp_vec2;

1962:   do {
1963:     sctx.newshift = PETSC_FALSE;
1964:     /* Now loop over each block-row, and do the factorization */
1965:     for (i=0,row=0; i<node_max; i++) {
1966:       nodesz = ns[i];
1967:       nz     = bi[row+1] - bi[row];
1968:       bjtmp  = bj + bi[row];

1970:       switch (nodesz) {
1971:       case 1:
1972:         for  (j=0; j<nz; j++) {
1973:           idx         = bjtmp[j];
1974:           rtmp11[idx] = 0.0;
1975:         }

1977:         /* load in initial (unfactored row) */
1978:         idx    = r[row];
1979:         nz_tmp = ai[idx+1] - ai[idx];
1980:         ajtmp  = aj + ai[idx];
1981:         v1     = aa + ai[idx];

1983:         for (j=0; j<nz_tmp; j++) {
1984:           idx         = ics[ajtmp[j]];
1985:           rtmp11[idx] = v1[j];
1986:         }
1987:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1989:         prow = *bjtmp++;
1990:         while (prow < row) {
1991:           pc1 = rtmp11 + prow;
1992:           if (*pc1 != 0.0) {
1993:             pv     = ba + bd[prow];
1994:             pj     = nbj + bd[prow];
1995:             mul1   = *pc1 * *pv++;
1996:             *pc1   = mul1;
1997:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1998:             PetscLogFlops(1+2.0*nz_tmp);
1999:             for (j=0; j<nz_tmp; j++) {
2000:               tmp          = pv[j];
2001:               idx          = pj[j];
2002:               rtmp11[idx] -= mul1 * tmp;
2003:             }
2004:           }
2005:           prow = *bjtmp++;
2006:         }
2007:         pj  = bj + bi[row];
2008:         pc1 = ba + bi[row];

2010:         sctx.pv     = rtmp11[row];
2011:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2012:         rs          = 0.0;
2013:         for (j=0; j<nz; j++) {
2014:           idx    = pj[j];
2015:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2016:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2017:         }
2018:         sctx.rs = rs;
2019:         MatPivotCheck(B,A,info,&sctx,row);
2020:         if (sctx.newshift) goto endofwhile;
2021:         break;

2023:       case 2:
2024:         for (j=0; j<nz; j++) {
2025:           idx         = bjtmp[j];
2026:           rtmp11[idx] = 0.0;
2027:           rtmp22[idx] = 0.0;
2028:         }

2030:         /* load in initial (unfactored row) */
2031:         idx    = r[row];
2032:         nz_tmp = ai[idx+1] - ai[idx];
2033:         ajtmp  = aj + ai[idx];
2034:         v1     = aa + ai[idx];
2035:         v2     = aa + ai[idx+1];
2036:         for (j=0; j<nz_tmp; j++) {
2037:           idx         = ics[ajtmp[j]];
2038:           rtmp11[idx] = v1[j];
2039:           rtmp22[idx] = v2[j];
2040:         }
2041:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2042:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2044:         prow = *bjtmp++;
2045:         while (prow < row) {
2046:           pc1 = rtmp11 + prow;
2047:           pc2 = rtmp22 + prow;
2048:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2049:             pv   = ba + bd[prow];
2050:             pj   = nbj + bd[prow];
2051:             mul1 = *pc1 * *pv;
2052:             mul2 = *pc2 * *pv;
2053:             ++pv;
2054:             *pc1 = mul1;
2055:             *pc2 = mul2;

2057:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2058:             for (j=0; j<nz_tmp; j++) {
2059:               tmp          = pv[j];
2060:               idx          = pj[j];
2061:               rtmp11[idx] -= mul1 * tmp;
2062:               rtmp22[idx] -= mul2 * tmp;
2063:             }
2064:             PetscLogFlops(2+4.0*nz_tmp);
2065:           }
2066:           prow = *bjtmp++;
2067:         }

2069:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2070:         pc1 = rtmp11 + prow;
2071:         pc2 = rtmp22 + prow;

2073:         sctx.pv = *pc1;
2074:         pj      = bj + bi[prow];
2075:         rs      = 0.0;
2076:         for (j=0; j<nz; j++) {
2077:           idx = pj[j];
2078:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2079:         }
2080:         sctx.rs = rs;
2081:         MatPivotCheck(B,A,info,&sctx,row);
2082:         if (sctx.newshift) goto endofwhile;

2084:         if (*pc2 != 0.0) {
2085:           pj     = nbj + bd[prow];
2086:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2087:           *pc2   = mul2;
2088:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2089:           for (j=0; j<nz_tmp; j++) {
2090:             idx          = pj[j];
2091:             tmp          = rtmp11[idx];
2092:             rtmp22[idx] -= mul2 * tmp;
2093:           }
2094:           PetscLogFlops(1+2.0*nz_tmp);
2095:         }

2097:         pj  = bj + bi[row];
2098:         pc1 = ba + bi[row];
2099:         pc2 = ba + bi[row+1];

2101:         sctx.pv       = rtmp22[row+1];
2102:         rs            = 0.0;
2103:         rtmp11[row]   = 1.0/rtmp11[row];
2104:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2105:         /* copy row entries from dense representation to sparse */
2106:         for (j=0; j<nz; j++) {
2107:           idx    = pj[j];
2108:           pc1[j] = rtmp11[idx];
2109:           pc2[j] = rtmp22[idx];
2110:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2111:         }
2112:         sctx.rs = rs;
2113:         MatPivotCheck(B,A,info,&sctx,row+1);
2114:         if (sctx.newshift) goto endofwhile;
2115:         break;

2117:       case 3:
2118:         for  (j=0; j<nz; j++) {
2119:           idx         = bjtmp[j];
2120:           rtmp11[idx] = 0.0;
2121:           rtmp22[idx] = 0.0;
2122:           rtmp33[idx] = 0.0;
2123:         }
2124:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2125:         idx    = r[row];
2126:         nz_tmp = ai[idx+1] - ai[idx];
2127:         ajtmp  = aj + ai[idx];
2128:         v1     = aa + ai[idx];
2129:         v2     = aa + ai[idx+1];
2130:         v3     = aa + ai[idx+2];
2131:         for (j=0; j<nz_tmp; j++) {
2132:           idx         = ics[ajtmp[j]];
2133:           rtmp11[idx] = v1[j];
2134:           rtmp22[idx] = v2[j];
2135:           rtmp33[idx] = v3[j];
2136:         }
2137:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2138:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2139:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2141:         /* loop over all pivot row blocks above this row block */
2142:         prow = *bjtmp++;
2143:         while (prow < row) {
2144:           pc1 = rtmp11 + prow;
2145:           pc2 = rtmp22 + prow;
2146:           pc3 = rtmp33 + prow;
2147:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2148:             pv   = ba  + bd[prow];
2149:             pj   = nbj + bd[prow];
2150:             mul1 = *pc1 * *pv;
2151:             mul2 = *pc2 * *pv;
2152:             mul3 = *pc3 * *pv;
2153:             ++pv;
2154:             *pc1 = mul1;
2155:             *pc2 = mul2;
2156:             *pc3 = mul3;

2158:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2159:             /* update this row based on pivot row */
2160:             for (j=0; j<nz_tmp; j++) {
2161:               tmp          = pv[j];
2162:               idx          = pj[j];
2163:               rtmp11[idx] -= mul1 * tmp;
2164:               rtmp22[idx] -= mul2 * tmp;
2165:               rtmp33[idx] -= mul3 * tmp;
2166:             }
2167:             PetscLogFlops(3+6.0*nz_tmp);
2168:           }
2169:           prow = *bjtmp++;
2170:         }

2172:         /* Now take care of diagonal 3x3 block in this set of rows */
2173:         /* note: prow = row here */
2174:         pc1 = rtmp11 + prow;
2175:         pc2 = rtmp22 + prow;
2176:         pc3 = rtmp33 + prow;

2178:         sctx.pv = *pc1;
2179:         pj      = bj + bi[prow];
2180:         rs      = 0.0;
2181:         for (j=0; j<nz; j++) {
2182:           idx = pj[j];
2183:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2184:         }
2185:         sctx.rs = rs;
2186:         MatPivotCheck(B,A,info,&sctx,row);
2187:         if (sctx.newshift) goto endofwhile;

2189:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2190:           mul2   = (*pc2)/(*pc1);
2191:           mul3   = (*pc3)/(*pc1);
2192:           *pc2   = mul2;
2193:           *pc3   = mul3;
2194:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2195:           pj     = nbj + bd[prow];
2196:           for (j=0; j<nz_tmp; j++) {
2197:             idx          = pj[j];
2198:             tmp          = rtmp11[idx];
2199:             rtmp22[idx] -= mul2 * tmp;
2200:             rtmp33[idx] -= mul3 * tmp;
2201:           }
2202:           PetscLogFlops(2+4.0*nz_tmp);
2203:         }
2204:         ++prow;

2206:         pc2     = rtmp22 + prow;
2207:         pc3     = rtmp33 + prow;
2208:         sctx.pv = *pc2;
2209:         pj      = bj + bi[prow];
2210:         rs      = 0.0;
2211:         for (j=0; j<nz; j++) {
2212:           idx = pj[j];
2213:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2214:         }
2215:         sctx.rs = rs;
2216:         MatPivotCheck(B,A,info,&sctx,row+1);
2217:         if (sctx.newshift) goto endofwhile;

2219:         if (*pc3 != 0.0) {
2220:           mul3   = (*pc3)/(*pc2);
2221:           *pc3   = mul3;
2222:           pj     = nbj + bd[prow];
2223:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224:           for (j=0; j<nz_tmp; j++) {
2225:             idx          = pj[j];
2226:             tmp          = rtmp22[idx];
2227:             rtmp33[idx] -= mul3 * tmp;
2228:           }
2229:           PetscLogFlops(1+2.0*nz_tmp);
2230:         }

2232:         pj  = bj + bi[row];
2233:         pc1 = ba + bi[row];
2234:         pc2 = ba + bi[row+1];
2235:         pc3 = ba + bi[row+2];

2237:         sctx.pv       = rtmp33[row+2];
2238:         rs            = 0.0;
2239:         rtmp11[row]   = 1.0/rtmp11[row];
2240:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2241:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2242:         /* copy row entries from dense representation to sparse */
2243:         for (j=0; j<nz; j++) {
2244:           idx    = pj[j];
2245:           pc1[j] = rtmp11[idx];
2246:           pc2[j] = rtmp22[idx];
2247:           pc3[j] = rtmp33[idx];
2248:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2249:         }

2251:         sctx.rs = rs;
2252:         MatPivotCheck(B,A,info,&sctx,row+2);
2253:         if (sctx.newshift) goto endofwhile;
2254:         break;

2256:       default:
2257:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2258:       }
2259:       row += nodesz;                 /* Update the row */
2260:     }
2261: endofwhile:;
2262:   } while (sctx.newshift);
2263:   PetscFree3(rtmp11,rtmp22,rtmp33);
2264:   PetscFree(tmp_vec2);
2265:   ISRestoreIndices(isicol,&ic);
2266:   ISRestoreIndices(isrow,&r);
2267:   ISRestoreIndices(iscol,&c);

2269:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2270:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2271:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2272:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2273:   C->assembled              = PETSC_TRUE;
2274:   C->preallocated           = PETSC_TRUE;
2275:   if (sctx.nshift) {
2276:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2277:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2278:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2279:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2280:     }
2281:   }
2282:   PetscLogFlops(C->cmap->n);
2283:   MatSeqAIJCheckInode(C);
2284:   return(0);
2285: }


2288: /* ----------------------------------------------------------- */
2289: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2290: {
2291:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2292:   IS                iscol = a->col,isrow = a->row;
2293:   PetscErrorCode    ierr;
2294:   const PetscInt    *r,*c,*rout,*cout;
2295:   PetscInt          i,j,n = A->rmap->n;
2296:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2297:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2298:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2299:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2300:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2301:   const PetscScalar *b;

2304:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2305:   node_max = a->inode.node_count;
2306:   ns       = a->inode.size;     /* Node Size array */

2308:   VecGetArrayRead(bb,&b);
2309:   VecGetArrayWrite(xx,&x);
2310:   tmp  = a->solve_work;

2312:   ISGetIndices(isrow,&rout); r = rout;
2313:   ISGetIndices(iscol,&cout); c = cout;

2315:   /* forward solve the lower triangular */
2316:   tmps = tmp;
2317:   aa   = a_a;
2318:   aj   = a_j;
2319:   ad   = a->diag;

2321:   for (i = 0,row = 0; i< node_max; ++i) {
2322:     nsz = ns[i];
2323:     aii = ai[row];
2324:     v1  = aa + aii;
2325:     vi  = aj + aii;
2326:     nz  = ai[row+1]- ai[row];

2328:     if (i < node_max-1) {
2329:       /* Prefetch the indices for the next block */
2330:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2331:       /* Prefetch the data for the next block */
2332:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2333:     }

2335:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2336:     case 1:
2337:       sum1 = b[r[row]];
2338:       for (j=0; j<nz-1; j+=2) {
2339:         i0    = vi[j];
2340:         i1    = vi[j+1];
2341:         tmp0  = tmps[i0];
2342:         tmp1  = tmps[i1];
2343:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2344:       }
2345:       if (j == nz-1) {
2346:         tmp0  = tmps[vi[j]];
2347:         sum1 -= v1[j]*tmp0;
2348:       }
2349:       tmp[row++]=sum1;
2350:       break;
2351:     case 2:
2352:       sum1 = b[r[row]];
2353:       sum2 = b[r[row+1]];
2354:       v2   = aa + ai[row+1];

2356:       for (j=0; j<nz-1; j+=2) {
2357:         i0    = vi[j];
2358:         i1    = vi[j+1];
2359:         tmp0  = tmps[i0];
2360:         tmp1  = tmps[i1];
2361:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2362:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2363:       }
2364:       if (j == nz-1) {
2365:         tmp0  = tmps[vi[j]];
2366:         sum1 -= v1[j] *tmp0;
2367:         sum2 -= v2[j] *tmp0;
2368:       }
2369:       sum2     -= v2[nz] * sum1;
2370:       tmp[row++]=sum1;
2371:       tmp[row++]=sum2;
2372:       break;
2373:     case 3:
2374:       sum1 = b[r[row]];
2375:       sum2 = b[r[row+1]];
2376:       sum3 = b[r[row+2]];
2377:       v2   = aa + ai[row+1];
2378:       v3   = aa + ai[row+2];

2380:       for (j=0; j<nz-1; j+=2) {
2381:         i0    = vi[j];
2382:         i1    = vi[j+1];
2383:         tmp0  = tmps[i0];
2384:         tmp1  = tmps[i1];
2385:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2386:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2387:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2388:       }
2389:       if (j == nz-1) {
2390:         tmp0  = tmps[vi[j]];
2391:         sum1 -= v1[j] *tmp0;
2392:         sum2 -= v2[j] *tmp0;
2393:         sum3 -= v3[j] *tmp0;
2394:       }
2395:       sum2     -= v2[nz] * sum1;
2396:       sum3     -= v3[nz] * sum1;
2397:       sum3     -= v3[nz+1] * sum2;
2398:       tmp[row++]=sum1;
2399:       tmp[row++]=sum2;
2400:       tmp[row++]=sum3;
2401:       break;

2403:     case 4:
2404:       sum1 = b[r[row]];
2405:       sum2 = b[r[row+1]];
2406:       sum3 = b[r[row+2]];
2407:       sum4 = b[r[row+3]];
2408:       v2   = aa + ai[row+1];
2409:       v3   = aa + ai[row+2];
2410:       v4   = aa + ai[row+3];

2412:       for (j=0; j<nz-1; j+=2) {
2413:         i0    = vi[j];
2414:         i1    = vi[j+1];
2415:         tmp0  = tmps[i0];
2416:         tmp1  = tmps[i1];
2417:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2418:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2419:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2420:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2421:       }
2422:       if (j == nz-1) {
2423:         tmp0  = tmps[vi[j]];
2424:         sum1 -= v1[j] *tmp0;
2425:         sum2 -= v2[j] *tmp0;
2426:         sum3 -= v3[j] *tmp0;
2427:         sum4 -= v4[j] *tmp0;
2428:       }
2429:       sum2 -= v2[nz] * sum1;
2430:       sum3 -= v3[nz] * sum1;
2431:       sum4 -= v4[nz] * sum1;
2432:       sum3 -= v3[nz+1] * sum2;
2433:       sum4 -= v4[nz+1] * sum2;
2434:       sum4 -= v4[nz+2] * sum3;

2436:       tmp[row++]=sum1;
2437:       tmp[row++]=sum2;
2438:       tmp[row++]=sum3;
2439:       tmp[row++]=sum4;
2440:       break;
2441:     case 5:
2442:       sum1 = b[r[row]];
2443:       sum2 = b[r[row+1]];
2444:       sum3 = b[r[row+2]];
2445:       sum4 = b[r[row+3]];
2446:       sum5 = b[r[row+4]];
2447:       v2   = aa + ai[row+1];
2448:       v3   = aa + ai[row+2];
2449:       v4   = aa + ai[row+3];
2450:       v5   = aa + ai[row+4];

2452:       for (j=0; j<nz-1; j+=2) {
2453:         i0    = vi[j];
2454:         i1    = vi[j+1];
2455:         tmp0  = tmps[i0];
2456:         tmp1  = tmps[i1];
2457:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2459:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2460:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2461:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2462:       }
2463:       if (j == nz-1) {
2464:         tmp0  = tmps[vi[j]];
2465:         sum1 -= v1[j] *tmp0;
2466:         sum2 -= v2[j] *tmp0;
2467:         sum3 -= v3[j] *tmp0;
2468:         sum4 -= v4[j] *tmp0;
2469:         sum5 -= v5[j] *tmp0;
2470:       }

2472:       sum2 -= v2[nz] * sum1;
2473:       sum3 -= v3[nz] * sum1;
2474:       sum4 -= v4[nz] * sum1;
2475:       sum5 -= v5[nz] * sum1;
2476:       sum3 -= v3[nz+1] * sum2;
2477:       sum4 -= v4[nz+1] * sum2;
2478:       sum5 -= v5[nz+1] * sum2;
2479:       sum4 -= v4[nz+2] * sum3;
2480:       sum5 -= v5[nz+2] * sum3;
2481:       sum5 -= v5[nz+3] * sum4;

2483:       tmp[row++]=sum1;
2484:       tmp[row++]=sum2;
2485:       tmp[row++]=sum3;
2486:       tmp[row++]=sum4;
2487:       tmp[row++]=sum5;
2488:       break;
2489:     default:
2490:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2491:     }
2492:   }
2493:   /* backward solve the upper triangular */
2494:   for (i=node_max -1,row = n-1; i>=0; i--) {
2495:     nsz = ns[i];
2496:     aii = ad[row+1] + 1;
2497:     v1  = aa + aii;
2498:     vi  = aj + aii;
2499:     nz  = ad[row]- ad[row+1] - 1;

2501:     if (i > 0) {
2502:       /* Prefetch the indices for the next block */
2503:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504:       /* Prefetch the data for the next block */
2505:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2506:     }

2508:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2509:     case 1:
2510:       sum1 = tmp[row];

2512:       for (j=0; j<nz-1; j+=2) {
2513:         i0    = vi[j];
2514:         i1    = vi[j+1];
2515:         tmp0  = tmps[i0];
2516:         tmp1  = tmps[i1];
2517:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2518:       }
2519:       if (j == nz-1) {
2520:         tmp0  = tmps[vi[j]];
2521:         sum1 -= v1[j]*tmp0;
2522:       }
2523:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2524:       break;
2525:     case 2:
2526:       sum1 = tmp[row];
2527:       sum2 = tmp[row-1];
2528:       v2   = aa + ad[row] + 1;
2529:       for (j=0; j<nz-1; j+=2) {
2530:         i0    = vi[j];
2531:         i1    = vi[j+1];
2532:         tmp0  = tmps[i0];
2533:         tmp1  = tmps[i1];
2534:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2535:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2536:       }
2537:       if (j == nz-1) {
2538:         tmp0  = tmps[vi[j]];
2539:         sum1 -= v1[j]* tmp0;
2540:         sum2 -= v2[j+1]* tmp0;
2541:       }

2543:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2544:       sum2     -= v2[0] * tmp0;
2545:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2546:       break;
2547:     case 3:
2548:       sum1 = tmp[row];
2549:       sum2 = tmp[row -1];
2550:       sum3 = tmp[row -2];
2551:       v2   = aa + ad[row] + 1;
2552:       v3   = aa + ad[row -1] + 1;
2553:       for (j=0; j<nz-1; j+=2) {
2554:         i0    = vi[j];
2555:         i1    = vi[j+1];
2556:         tmp0  = tmps[i0];
2557:         tmp1  = tmps[i1];
2558:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2560:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2561:       }
2562:       if (j== nz-1) {
2563:         tmp0  = tmps[vi[j]];
2564:         sum1 -= v1[j] * tmp0;
2565:         sum2 -= v2[j+1] * tmp0;
2566:         sum3 -= v3[j+2] * tmp0;
2567:       }
2568:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2569:       sum2     -= v2[0]* tmp0;
2570:       sum3     -= v3[1] * tmp0;
2571:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2572:       sum3     -= v3[0]* tmp0;
2573:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2575:       break;
2576:     case 4:
2577:       sum1 = tmp[row];
2578:       sum2 = tmp[row -1];
2579:       sum3 = tmp[row -2];
2580:       sum4 = tmp[row -3];
2581:       v2   = aa + ad[row]+1;
2582:       v3   = aa + ad[row -1]+1;
2583:       v4   = aa + ad[row -2]+1;

2585:       for (j=0; j<nz-1; j+=2) {
2586:         i0    = vi[j];
2587:         i1    = vi[j+1];
2588:         tmp0  = tmps[i0];
2589:         tmp1  = tmps[i1];
2590:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2591:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2592:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2593:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2594:       }
2595:       if (j== nz-1) {
2596:         tmp0  = tmps[vi[j]];
2597:         sum1 -= v1[j] * tmp0;
2598:         sum2 -= v2[j+1] * tmp0;
2599:         sum3 -= v3[j+2] * tmp0;
2600:         sum4 -= v4[j+3] * tmp0;
2601:       }

2603:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604:       sum2     -= v2[0] * tmp0;
2605:       sum3     -= v3[1] * tmp0;
2606:       sum4     -= v4[2] * tmp0;
2607:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608:       sum3     -= v3[0] * tmp0;
2609:       sum4     -= v4[1] * tmp0;
2610:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2611:       sum4     -= v4[0] * tmp0;
2612:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2613:       break;
2614:     case 5:
2615:       sum1 = tmp[row];
2616:       sum2 = tmp[row -1];
2617:       sum3 = tmp[row -2];
2618:       sum4 = tmp[row -3];
2619:       sum5 = tmp[row -4];
2620:       v2   = aa + ad[row]+1;
2621:       v3   = aa + ad[row -1]+1;
2622:       v4   = aa + ad[row -2]+1;
2623:       v5   = aa + ad[row -3]+1;
2624:       for (j=0; j<nz-1; j+=2) {
2625:         i0    = vi[j];
2626:         i1    = vi[j+1];
2627:         tmp0  = tmps[i0];
2628:         tmp1  = tmps[i1];
2629:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2634:       }
2635:       if (j==nz-1) {
2636:         tmp0  = tmps[vi[j]];
2637:         sum1 -= v1[j] * tmp0;
2638:         sum2 -= v2[j+1] * tmp0;
2639:         sum3 -= v3[j+2] * tmp0;
2640:         sum4 -= v4[j+3] * tmp0;
2641:         sum5 -= v5[j+4] * tmp0;
2642:       }

2644:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645:       sum2     -= v2[0] * tmp0;
2646:       sum3     -= v3[1] * tmp0;
2647:       sum4     -= v4[2] * tmp0;
2648:       sum5     -= v5[3] * tmp0;
2649:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650:       sum3     -= v3[0] * tmp0;
2651:       sum4     -= v4[1] * tmp0;
2652:       sum5     -= v5[2] * tmp0;
2653:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2654:       sum4     -= v4[0] * tmp0;
2655:       sum5     -= v5[1] * tmp0;
2656:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2657:       sum5     -= v5[0] * tmp0;
2658:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2659:       break;
2660:     default:
2661:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2662:     }
2663:   }
2664:   ISRestoreIndices(isrow,&rout);
2665:   ISRestoreIndices(iscol,&cout);
2666:   VecRestoreArrayRead(bb,&b);
2667:   VecRestoreArrayWrite(xx,&x);
2668:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2669:   return(0);
2670: }


2673: /*
2674:      Makes a longer coloring[] array and calls the usual code with that
2675: */
2676: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2677: {
2678:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2679:   PetscErrorCode  ierr;
2680:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2681:   PetscInt        *colorused,i;
2682:   ISColoringValue *newcolor;

2685:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2686:   PetscMalloc1(n+1,&newcolor);
2687:   /* loop over inodes, marking a color for each column*/
2688:   row = 0;
2689:   for (i=0; i<m; i++) {
2690:     for (j=0; j<ns[i]; j++) {
2691:       newcolor[row++] = coloring[i] + j*ncolors;
2692:     }
2693:   }

2695:   /* eliminate unneeded colors */
2696:   PetscCalloc1(5*ncolors,&colorused);
2697:   for (i=0; i<n; i++) {
2698:     colorused[newcolor[i]] = 1;
2699:   }

2701:   for (i=1; i<5*ncolors; i++) {
2702:     colorused[i] += colorused[i-1];
2703:   }
2704:   ncolors = colorused[5*ncolors-1];
2705:   for (i=0; i<n; i++) {
2706:     newcolor[i] = colorused[newcolor[i]]-1;
2707:   }
2708:   PetscFree(colorused);
2709:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2710:   PetscFree(coloring);
2711:   return(0);
2712: }

2714: #include <petsc/private/kernels/blockinvert.h>

2716: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2717: {
2718:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2719:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2720:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2721:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2722:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2723:   const PetscScalar *xb, *b;
2724:   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2725:   PetscErrorCode    ierr;
2726:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2727:   PetscInt          sz,k,ipvt[5];
2728:   PetscBool         allowzeropivot,zeropivotdetected;
2729:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2732:   allowzeropivot = PetscNot(A->erroriffailure);
2733:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2734:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2735:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2737:   if (!a->inode.ibdiagvalid) {
2738:     if (!a->inode.ibdiag) {
2739:       /* calculate space needed for diagonal blocks */
2740:       for (i=0; i<m; i++) {
2741:         cnt += sizes[i]*sizes[i];
2742:       }
2743:       a->inode.bdiagsize = cnt;

2745:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2746:     }

2748:     /* copy over the diagonal blocks and invert them */
2749:     ibdiag = a->inode.ibdiag;
2750:     bdiag  = a->inode.bdiag;
2751:     cnt    = 0;
2752:     for (i=0, row = 0; i<m; i++) {
2753:       for (j=0; j<sizes[i]; j++) {
2754:         for (k=0; k<sizes[i]; k++) {
2755:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2756:         }
2757:       }
2758:       PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);

2760:       switch (sizes[i]) {
2761:       case 1:
2762:         /* Create matrix data structure */
2763:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2764:           if (allowzeropivot) {
2765:             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2766:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2767:             A->factorerror_zeropivot_row   = row;
2768:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2769:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2770:         }
2771:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2772:         break;
2773:       case 2:
2774:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2775:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2776:         break;
2777:       case 3:
2778:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2779:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2780:         break;
2781:       case 4:
2782:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2783:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2784:         break;
2785:       case 5:
2786:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2787:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2788:         break;
2789:       default:
2790:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2791:       }
2792:       cnt += sizes[i]*sizes[i];
2793:       row += sizes[i];
2794:     }
2795:     a->inode.ibdiagvalid = PETSC_TRUE;
2796:   }
2797:   ibdiag = a->inode.ibdiag;
2798:   bdiag  = a->inode.bdiag;
2799:   t      = a->inode.ssor_work;

2801:   VecGetArray(xx,&x);
2802:   VecGetArrayRead(bb,&b);
2803:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2804:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2805:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2807:       for (i=0, row=0; i<m; i++) {
2808:         sz  = diag[row] - ii[row];
2809:         v1  = a->a + ii[row];
2810:         idx = a->j + ii[row];

2812:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2813:         switch (sizes[i]) {
2814:         case 1:

2816:           sum1 = b[row];
2817:           for (n = 0; n<sz-1; n+=2) {
2818:             i1    = idx[0];
2819:             i2    = idx[1];
2820:             idx  += 2;
2821:             tmp0  = x[i1];
2822:             tmp1  = x[i2];
2823:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2824:           }

2826:           if (n == sz-1) {
2827:             tmp0  = x[*idx];
2828:             sum1 -= *v1 * tmp0;
2829:           }
2830:           t[row]   = sum1;
2831:           x[row++] = sum1*(*ibdiag++);
2832:           break;
2833:         case 2:
2834:           v2   = a->a + ii[row+1];
2835:           sum1 = b[row];
2836:           sum2 = b[row+1];
2837:           for (n = 0; n<sz-1; n+=2) {
2838:             i1    = idx[0];
2839:             i2    = idx[1];
2840:             idx  += 2;
2841:             tmp0  = x[i1];
2842:             tmp1  = x[i2];
2843:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2844:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2845:           }

2847:           if (n == sz-1) {
2848:             tmp0  = x[*idx];
2849:             sum1 -= v1[0] * tmp0;
2850:             sum2 -= v2[0] * tmp0;
2851:           }
2852:           t[row]   = sum1;
2853:           t[row+1] = sum2;
2854:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2855:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2856:           ibdiag  += 4;
2857:           break;
2858:         case 3:
2859:           v2   = a->a + ii[row+1];
2860:           v3   = a->a + ii[row+2];
2861:           sum1 = b[row];
2862:           sum2 = b[row+1];
2863:           sum3 = b[row+2];
2864:           for (n = 0; n<sz-1; n+=2) {
2865:             i1    = idx[0];
2866:             i2    = idx[1];
2867:             idx  += 2;
2868:             tmp0  = x[i1];
2869:             tmp1  = x[i2];
2870:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2871:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2872:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2873:           }

2875:           if (n == sz-1) {
2876:             tmp0  = x[*idx];
2877:             sum1 -= v1[0] * tmp0;
2878:             sum2 -= v2[0] * tmp0;
2879:             sum3 -= v3[0] * tmp0;
2880:           }
2881:           t[row]   = sum1;
2882:           t[row+1] = sum2;
2883:           t[row+2] = sum3;
2884:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2885:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2886:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2887:           ibdiag  += 9;
2888:           break;
2889:         case 4:
2890:           v2   = a->a + ii[row+1];
2891:           v3   = a->a + ii[row+2];
2892:           v4   = a->a + ii[row+3];
2893:           sum1 = b[row];
2894:           sum2 = b[row+1];
2895:           sum3 = b[row+2];
2896:           sum4 = b[row+3];
2897:           for (n = 0; n<sz-1; n+=2) {
2898:             i1    = idx[0];
2899:             i2    = idx[1];
2900:             idx  += 2;
2901:             tmp0  = x[i1];
2902:             tmp1  = x[i2];
2903:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2907:           }

2909:           if (n == sz-1) {
2910:             tmp0  = x[*idx];
2911:             sum1 -= v1[0] * tmp0;
2912:             sum2 -= v2[0] * tmp0;
2913:             sum3 -= v3[0] * tmp0;
2914:             sum4 -= v4[0] * tmp0;
2915:           }
2916:           t[row]   = sum1;
2917:           t[row+1] = sum2;
2918:           t[row+2] = sum3;
2919:           t[row+3] = sum4;
2920:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2921:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2922:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2923:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2924:           ibdiag  += 16;
2925:           break;
2926:         case 5:
2927:           v2   = a->a + ii[row+1];
2928:           v3   = a->a + ii[row+2];
2929:           v4   = a->a + ii[row+3];
2930:           v5   = a->a + ii[row+4];
2931:           sum1 = b[row];
2932:           sum2 = b[row+1];
2933:           sum3 = b[row+2];
2934:           sum4 = b[row+3];
2935:           sum5 = b[row+4];
2936:           for (n = 0; n<sz-1; n+=2) {
2937:             i1    = idx[0];
2938:             i2    = idx[1];
2939:             idx  += 2;
2940:             tmp0  = x[i1];
2941:             tmp1  = x[i2];
2942:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2943:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2944:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2945:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2946:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2947:           }

2949:           if (n == sz-1) {
2950:             tmp0  = x[*idx];
2951:             sum1 -= v1[0] * tmp0;
2952:             sum2 -= v2[0] * tmp0;
2953:             sum3 -= v3[0] * tmp0;
2954:             sum4 -= v4[0] * tmp0;
2955:             sum5 -= v5[0] * tmp0;
2956:           }
2957:           t[row]   = sum1;
2958:           t[row+1] = sum2;
2959:           t[row+2] = sum3;
2960:           t[row+3] = sum4;
2961:           t[row+4] = sum5;
2962:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2963:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2964:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2965:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2966:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2967:           ibdiag  += 25;
2968:           break;
2969:         default:
2970:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2971:         }
2972:       }

2974:       xb   = t;
2975:       PetscLogFlops(a->nz);
2976:     } else xb = b;
2977:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

2979:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2980:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2981:         ibdiag -= sizes[i]*sizes[i];
2982:         sz      = ii[row+1] - diag[row] - 1;
2983:         v1      = a->a + diag[row] + 1;
2984:         idx     = a->j + diag[row] + 1;

2986:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2987:         switch (sizes[i]) {
2988:         case 1:

2990:           sum1 = xb[row];
2991:           for (n = 0; n<sz-1; n+=2) {
2992:             i1    = idx[0];
2993:             i2    = idx[1];
2994:             idx  += 2;
2995:             tmp0  = x[i1];
2996:             tmp1  = x[i2];
2997:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2998:           }

3000:           if (n == sz-1) {
3001:             tmp0  = x[*idx];
3002:             sum1 -= *v1*tmp0;
3003:           }
3004:           x[row--] = sum1*(*ibdiag);
3005:           break;

3007:         case 2:

3009:           sum1 = xb[row];
3010:           sum2 = xb[row-1];
3011:           /* note that sum1 is associated with the second of the two rows */
3012:           v2 = a->a + diag[row-1] + 2;
3013:           for (n = 0; n<sz-1; n+=2) {
3014:             i1    = idx[0];
3015:             i2    = idx[1];
3016:             idx  += 2;
3017:             tmp0  = x[i1];
3018:             tmp1  = x[i2];
3019:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3020:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3021:           }

3023:           if (n == sz-1) {
3024:             tmp0  = x[*idx];
3025:             sum1 -= *v1*tmp0;
3026:             sum2 -= *v2*tmp0;
3027:           }
3028:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3029:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3030:           break;
3031:         case 3:

3033:           sum1 = xb[row];
3034:           sum2 = xb[row-1];
3035:           sum3 = xb[row-2];
3036:           v2   = a->a + diag[row-1] + 2;
3037:           v3   = a->a + diag[row-2] + 3;
3038:           for (n = 0; n<sz-1; n+=2) {
3039:             i1    = idx[0];
3040:             i2    = idx[1];
3041:             idx  += 2;
3042:             tmp0  = x[i1];
3043:             tmp1  = x[i2];
3044:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3047:           }

3049:           if (n == sz-1) {
3050:             tmp0  = x[*idx];
3051:             sum1 -= *v1*tmp0;
3052:             sum2 -= *v2*tmp0;
3053:             sum3 -= *v3*tmp0;
3054:           }
3055:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3056:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3057:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3058:           break;
3059:         case 4:

3061:           sum1 = xb[row];
3062:           sum2 = xb[row-1];
3063:           sum3 = xb[row-2];
3064:           sum4 = xb[row-3];
3065:           v2   = a->a + diag[row-1] + 2;
3066:           v3   = a->a + diag[row-2] + 3;
3067:           v4   = a->a + diag[row-3] + 4;
3068:           for (n = 0; n<sz-1; n+=2) {
3069:             i1    = idx[0];
3070:             i2    = idx[1];
3071:             idx  += 2;
3072:             tmp0  = x[i1];
3073:             tmp1  = x[i2];
3074:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3078:           }

3080:           if (n == sz-1) {
3081:             tmp0  = x[*idx];
3082:             sum1 -= *v1*tmp0;
3083:             sum2 -= *v2*tmp0;
3084:             sum3 -= *v3*tmp0;
3085:             sum4 -= *v4*tmp0;
3086:           }
3087:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3088:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3089:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3090:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3091:           break;
3092:         case 5:

3094:           sum1 = xb[row];
3095:           sum2 = xb[row-1];
3096:           sum3 = xb[row-2];
3097:           sum4 = xb[row-3];
3098:           sum5 = xb[row-4];
3099:           v2   = a->a + diag[row-1] + 2;
3100:           v3   = a->a + diag[row-2] + 3;
3101:           v4   = a->a + diag[row-3] + 4;
3102:           v5   = a->a + diag[row-4] + 5;
3103:           for (n = 0; n<sz-1; n+=2) {
3104:             i1    = idx[0];
3105:             i2    = idx[1];
3106:             idx  += 2;
3107:             tmp0  = x[i1];
3108:             tmp1  = x[i2];
3109:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3110:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3111:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3112:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3113:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3114:           }

3116:           if (n == sz-1) {
3117:             tmp0  = x[*idx];
3118:             sum1 -= *v1*tmp0;
3119:             sum2 -= *v2*tmp0;
3120:             sum3 -= *v3*tmp0;
3121:             sum4 -= *v4*tmp0;
3122:             sum5 -= *v5*tmp0;
3123:           }
3124:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3125:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3126:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3127:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3128:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3129:           break;
3130:         default:
3131:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3132:         }
3133:       }

3135:       PetscLogFlops(a->nz);
3136:     }
3137:     its--;
3138:   }
3139:   while (its--) {

3141:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3142:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3143:            i<m;
3144:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3146:         sz  = diag[row] - ii[row];
3147:         v1  = a->a + ii[row];
3148:         idx = a->j + ii[row];
3149:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3150:         switch (sizes[i]) {
3151:         case 1:
3152:           sum1 = b[row];
3153:           for (n = 0; n<sz-1; n+=2) {
3154:             i1    = idx[0];
3155:             i2    = idx[1];
3156:             idx  += 2;
3157:             tmp0  = x[i1];
3158:             tmp1  = x[i2];
3159:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3160:           }
3161:           if (n == sz-1) {
3162:             tmp0  = x[*idx++];
3163:             sum1 -= *v1 * tmp0;
3164:             v1++;
3165:           }
3166:           t[row]   = sum1;
3167:           sz      = ii[row+1] - diag[row] - 1;
3168:           idx     = a->j + diag[row] + 1;
3169:           v1 += 1;
3170:           for (n = 0; n<sz-1; n+=2) {
3171:             i1    = idx[0];
3172:             i2    = idx[1];
3173:             idx  += 2;
3174:             tmp0  = x[i1];
3175:             tmp1  = x[i2];
3176:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3177:           }
3178:           if (n == sz-1) {
3179:             tmp0  = x[*idx++];
3180:             sum1 -= *v1 * tmp0;
3181:           }
3182:           /* in MatSOR_SeqAIJ this line would be
3183:            *
3184:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3185:            *
3186:            * but omega == 1, so this becomes
3187:            *
3188:            * x[row] = sum1*(*ibdiag++);
3189:            *
3190:            */
3191:           x[row] = sum1*(*ibdiag);
3192:           break;
3193:         case 2:
3194:           v2   = a->a + ii[row+1];
3195:           sum1 = b[row];
3196:           sum2 = b[row+1];
3197:           for (n = 0; n<sz-1; n+=2) {
3198:             i1    = idx[0];
3199:             i2    = idx[1];
3200:             idx  += 2;
3201:             tmp0  = x[i1];
3202:             tmp1  = x[i2];
3203:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3204:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3205:           }
3206:           if (n == sz-1) {
3207:             tmp0  = x[*idx++];
3208:             sum1 -= v1[0] * tmp0;
3209:             sum2 -= v2[0] * tmp0;
3210:             v1++; v2++;
3211:           }
3212:           t[row]   = sum1;
3213:           t[row+1] = sum2;
3214:           sz      = ii[row+1] - diag[row] - 2;
3215:           idx     = a->j + diag[row] + 2;
3216:           v1 += 2;
3217:           v2 += 2;
3218:           for (n = 0; n<sz-1; n+=2) {
3219:             i1    = idx[0];
3220:             i2    = idx[1];
3221:             idx  += 2;
3222:             tmp0  = x[i1];
3223:             tmp1  = x[i2];
3224:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226:           }
3227:           if (n == sz-1) {
3228:             tmp0  = x[*idx];
3229:             sum1 -= v1[0] * tmp0;
3230:             sum2 -= v2[0] * tmp0;
3231:           }
3232:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3233:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3234:           break;
3235:         case 3:
3236:           v2   = a->a + ii[row+1];
3237:           v3   = a->a + ii[row+2];
3238:           sum1 = b[row];
3239:           sum2 = b[row+1];
3240:           sum3 = b[row+2];
3241:           for (n = 0; n<sz-1; n+=2) {
3242:             i1    = idx[0];
3243:             i2    = idx[1];
3244:             idx  += 2;
3245:             tmp0  = x[i1];
3246:             tmp1  = x[i2];
3247:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250:           }
3251:           if (n == sz-1) {
3252:             tmp0  = x[*idx++];
3253:             sum1 -= v1[0] * tmp0;
3254:             sum2 -= v2[0] * tmp0;
3255:             sum3 -= v3[0] * tmp0;
3256:             v1++; v2++; v3++;
3257:           }
3258:           t[row]   = sum1;
3259:           t[row+1] = sum2;
3260:           t[row+2] = sum3;
3261:           sz      = ii[row+1] - diag[row] - 3;
3262:           idx     = a->j + diag[row] + 3;
3263:           v1 += 3;
3264:           v2 += 3;
3265:           v3 += 3;
3266:           for (n = 0; n<sz-1; n+=2) {
3267:             i1    = idx[0];
3268:             i2    = idx[1];
3269:             idx  += 2;
3270:             tmp0  = x[i1];
3271:             tmp1  = x[i2];
3272:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275:           }
3276:           if (n == sz-1) {
3277:             tmp0  = x[*idx];
3278:             sum1 -= v1[0] * tmp0;
3279:             sum2 -= v2[0] * tmp0;
3280:             sum3 -= v3[0] * tmp0;
3281:           }
3282:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3283:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3284:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3285:           break;
3286:         case 4:
3287:           v2   = a->a + ii[row+1];
3288:           v3   = a->a + ii[row+2];
3289:           v4   = a->a + ii[row+3];
3290:           sum1 = b[row];
3291:           sum2 = b[row+1];
3292:           sum3 = b[row+2];
3293:           sum4 = b[row+3];
3294:           for (n = 0; n<sz-1; n+=2) {
3295:             i1    = idx[0];
3296:             i2    = idx[1];
3297:             idx  += 2;
3298:             tmp0  = x[i1];
3299:             tmp1  = x[i2];
3300:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3304:           }
3305:           if (n == sz-1) {
3306:             tmp0  = x[*idx++];
3307:             sum1 -= v1[0] * tmp0;
3308:             sum2 -= v2[0] * tmp0;
3309:             sum3 -= v3[0] * tmp0;
3310:             sum4 -= v4[0] * tmp0;
3311:             v1++; v2++; v3++; v4++;
3312:           }
3313:           t[row]   = sum1;
3314:           t[row+1] = sum2;
3315:           t[row+2] = sum3;
3316:           t[row+3] = sum4;
3317:           sz      = ii[row+1] - diag[row] - 4;
3318:           idx     = a->j + diag[row] + 4;
3319:           v1 += 4;
3320:           v2 += 4;
3321:           v3 += 4;
3322:           v4 += 4;
3323:           for (n = 0; n<sz-1; n+=2) {
3324:             i1    = idx[0];
3325:             i2    = idx[1];
3326:             idx  += 2;
3327:             tmp0  = x[i1];
3328:             tmp1  = x[i2];
3329:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333:           }
3334:           if (n == sz-1) {
3335:             tmp0  = x[*idx];
3336:             sum1 -= v1[0] * tmp0;
3337:             sum2 -= v2[0] * tmp0;
3338:             sum3 -= v3[0] * tmp0;
3339:             sum4 -= v4[0] * tmp0;
3340:           }
3341:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3342:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3343:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3344:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3345:           break;
3346:         case 5:
3347:           v2   = a->a + ii[row+1];
3348:           v3   = a->a + ii[row+2];
3349:           v4   = a->a + ii[row+3];
3350:           v5   = a->a + ii[row+4];
3351:           sum1 = b[row];
3352:           sum2 = b[row+1];
3353:           sum3 = b[row+2];
3354:           sum4 = b[row+3];
3355:           sum5 = b[row+4];
3356:           for (n = 0; n<sz-1; n+=2) {
3357:             i1    = idx[0];
3358:             i2    = idx[1];
3359:             idx  += 2;
3360:             tmp0  = x[i1];
3361:             tmp1  = x[i2];
3362:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367:           }
3368:           if (n == sz-1) {
3369:             tmp0  = x[*idx++];
3370:             sum1 -= v1[0] * tmp0;
3371:             sum2 -= v2[0] * tmp0;
3372:             sum3 -= v3[0] * tmp0;
3373:             sum4 -= v4[0] * tmp0;
3374:             sum5 -= v5[0] * tmp0;
3375:             v1++; v2++; v3++; v4++; v5++;
3376:           }
3377:           t[row]   = sum1;
3378:           t[row+1] = sum2;
3379:           t[row+2] = sum3;
3380:           t[row+3] = sum4;
3381:           t[row+4] = sum5;
3382:           sz      = ii[row+1] - diag[row] - 5;
3383:           idx     = a->j + diag[row] + 5;
3384:           v1 += 5;
3385:           v2 += 5;
3386:           v3 += 5;
3387:           v4 += 5;
3388:           v5 += 5;
3389:           for (n = 0; n<sz-1; n+=2) {
3390:             i1    = idx[0];
3391:             i2    = idx[1];
3392:             idx  += 2;
3393:             tmp0  = x[i1];
3394:             tmp1  = x[i2];
3395:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3396:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3397:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3398:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3399:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3400:           }
3401:           if (n == sz-1) {
3402:             tmp0  = x[*idx];
3403:             sum1 -= v1[0] * tmp0;
3404:             sum2 -= v2[0] * tmp0;
3405:             sum3 -= v3[0] * tmp0;
3406:             sum4 -= v4[0] * tmp0;
3407:             sum5 -= v5[0] * tmp0;
3408:           }
3409:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3410:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3411:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3412:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3413:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3414:           break;
3415:         default:
3416:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3417:         }
3418:       }
3419:       xb   = t;
3420:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3421:     } else xb = b;

3423:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3425:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3426:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3427:         ibdiag -= sizes[i]*sizes[i];

3429:         /* set RHS */
3430:         if (xb == b) {
3431:           /* whole (old way) */
3432:           sz      = ii[row+1] - ii[row];
3433:           idx     = a->j + ii[row];
3434:           switch (sizes[i]) {
3435:           case 5:
3436:             v5      = a->a + ii[row-4];
3437:           case 4: /* fall through */
3438:             v4      = a->a + ii[row-3];
3439:           case 3:
3440:             v3      = a->a + ii[row-2];
3441:           case 2:
3442:             v2      = a->a + ii[row-1];
3443:           case 1:
3444:             v1      = a->a + ii[row];
3445:             break;
3446:           default:
3447:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3448:           }
3449:         } else {
3450:           /* upper, no diag */
3451:           sz      = ii[row+1] - diag[row] - 1;
3452:           idx     = a->j + diag[row] + 1;
3453:           switch (sizes[i]) {
3454:           case 5:
3455:             v5      = a->a + diag[row-4] + 5;
3456:           case 4: /* fall through */
3457:             v4      = a->a + diag[row-3] + 4;
3458:           case 3:
3459:             v3      = a->a + diag[row-2] + 3;
3460:           case 2:
3461:             v2      = a->a + diag[row-1] + 2;
3462:           case 1:
3463:             v1      = a->a + diag[row] + 1;
3464:           }
3465:         }
3466:         /* set sum */
3467:         switch (sizes[i]) {
3468:         case 5:
3469:           sum5 = xb[row-4];
3470:         case 4: /* fall through */
3471:           sum4 = xb[row-3];
3472:         case 3:
3473:           sum3 = xb[row-2];
3474:         case 2:
3475:           sum2 = xb[row-1];
3476:         case 1:
3477:           /* note that sum1 is associated with the last row */
3478:           sum1 = xb[row];
3479:         }
3480:         /* do sums */
3481:         for (n = 0; n<sz-1; n+=2) {
3482:             i1    = idx[0];
3483:             i2    = idx[1];
3484:             idx  += 2;
3485:             tmp0  = x[i1];
3486:             tmp1  = x[i2];
3487:             switch (sizes[i]) {
3488:             case 5:
3489:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3490:             case 4: /* fall through */
3491:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3492:             case 3:
3493:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3494:             case 2:
3495:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3496:             case 1:
3497:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3498:             }
3499:         }
3500:         /* ragged edge */
3501:         if (n == sz-1) {
3502:           tmp0  = x[*idx];
3503:           switch (sizes[i]) {
3504:           case 5:
3505:             sum5 -= *v5*tmp0;
3506:           case 4: /* fall through */
3507:             sum4 -= *v4*tmp0;
3508:           case 3:
3509:             sum3 -= *v3*tmp0;
3510:           case 2:
3511:             sum2 -= *v2*tmp0;
3512:           case 1:
3513:             sum1 -= *v1*tmp0;
3514:           }
3515:         }
3516:         /* update */
3517:         if (xb == b) {
3518:           /* whole (old way) w/ diag */
3519:           switch (sizes[i]) {
3520:           case 5:
3521:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3522:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3523:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3524:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3525:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3526:             break;
3527:           case 4:
3528:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3529:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3530:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3531:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3532:             break;
3533:           case 3:
3534:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3535:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3536:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3537:             break;
3538:           case 2:
3539:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3540:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3541:             break;
3542:           case 1:
3543:             x[row--] += sum1*(*ibdiag);
3544:             break;
3545:           }
3546:         } else {
3547:           /* no diag so set =  */
3548:           switch (sizes[i]) {
3549:           case 5:
3550:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555:             break;
3556:           case 4:
3557:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561:             break;
3562:           case 3:
3563:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566:             break;
3567:           case 2:
3568:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3569:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3570:             break;
3571:           case 1:
3572:             x[row--] = sum1*(*ibdiag);
3573:             break;
3574:           }
3575:         }
3576:       }
3577:       if (xb == b) {
3578:         PetscLogFlops(2.0*a->nz);
3579:       } else {
3580:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3581:       }
3582:     }
3583:   }
3584:   if (flag & SOR_EISENSTAT) {
3585:     /*
3586:           Apply  (U + D)^-1  where D is now the block diagonal
3587:     */
3588:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3589:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3590:       ibdiag -= sizes[i]*sizes[i];
3591:       sz      = ii[row+1] - diag[row] - 1;
3592:       v1      = a->a + diag[row] + 1;
3593:       idx     = a->j + diag[row] + 1;
3594:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3595:       switch (sizes[i]) {
3596:       case 1:

3598:         sum1 = b[row];
3599:         for (n = 0; n<sz-1; n+=2) {
3600:           i1    = idx[0];
3601:           i2    = idx[1];
3602:           idx  += 2;
3603:           tmp0  = x[i1];
3604:           tmp1  = x[i2];
3605:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606:         }

3608:         if (n == sz-1) {
3609:           tmp0  = x[*idx];
3610:           sum1 -= *v1*tmp0;
3611:         }
3612:         x[row] = sum1*(*ibdiag);row--;
3613:         break;

3615:       case 2:

3617:         sum1 = b[row];
3618:         sum2 = b[row-1];
3619:         /* note that sum1 is associated with the second of the two rows */
3620:         v2 = a->a + diag[row-1] + 2;
3621:         for (n = 0; n<sz-1; n+=2) {
3622:           i1    = idx[0];
3623:           i2    = idx[1];
3624:           idx  += 2;
3625:           tmp0  = x[i1];
3626:           tmp1  = x[i2];
3627:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3628:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3629:         }

3631:         if (n == sz-1) {
3632:           tmp0  = x[*idx];
3633:           sum1 -= *v1*tmp0;
3634:           sum2 -= *v2*tmp0;
3635:         }
3636:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3637:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3638:         row     -= 2;
3639:         break;
3640:       case 3:

3642:         sum1 = b[row];
3643:         sum2 = b[row-1];
3644:         sum3 = b[row-2];
3645:         v2   = a->a + diag[row-1] + 2;
3646:         v3   = a->a + diag[row-2] + 3;
3647:         for (n = 0; n<sz-1; n+=2) {
3648:           i1    = idx[0];
3649:           i2    = idx[1];
3650:           idx  += 2;
3651:           tmp0  = x[i1];
3652:           tmp1  = x[i2];
3653:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3654:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3655:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3656:         }

3658:         if (n == sz-1) {
3659:           tmp0  = x[*idx];
3660:           sum1 -= *v1*tmp0;
3661:           sum2 -= *v2*tmp0;
3662:           sum3 -= *v3*tmp0;
3663:         }
3664:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3665:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3666:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3667:         row     -= 3;
3668:         break;
3669:       case 4:

3671:         sum1 = b[row];
3672:         sum2 = b[row-1];
3673:         sum3 = b[row-2];
3674:         sum4 = b[row-3];
3675:         v2   = a->a + diag[row-1] + 2;
3676:         v3   = a->a + diag[row-2] + 3;
3677:         v4   = a->a + diag[row-3] + 4;
3678:         for (n = 0; n<sz-1; n+=2) {
3679:           i1    = idx[0];
3680:           i2    = idx[1];
3681:           idx  += 2;
3682:           tmp0  = x[i1];
3683:           tmp1  = x[i2];
3684:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3685:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3686:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3687:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3688:         }

3690:         if (n == sz-1) {
3691:           tmp0  = x[*idx];
3692:           sum1 -= *v1*tmp0;
3693:           sum2 -= *v2*tmp0;
3694:           sum3 -= *v3*tmp0;
3695:           sum4 -= *v4*tmp0;
3696:         }
3697:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3698:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3699:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3700:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3701:         row     -= 4;
3702:         break;
3703:       case 5:

3705:         sum1 = b[row];
3706:         sum2 = b[row-1];
3707:         sum3 = b[row-2];
3708:         sum4 = b[row-3];
3709:         sum5 = b[row-4];
3710:         v2   = a->a + diag[row-1] + 2;
3711:         v3   = a->a + diag[row-2] + 3;
3712:         v4   = a->a + diag[row-3] + 4;
3713:         v5   = a->a + diag[row-4] + 5;
3714:         for (n = 0; n<sz-1; n+=2) {
3715:           i1    = idx[0];
3716:           i2    = idx[1];
3717:           idx  += 2;
3718:           tmp0  = x[i1];
3719:           tmp1  = x[i2];
3720:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3725:         }

3727:         if (n == sz-1) {
3728:           tmp0  = x[*idx];
3729:           sum1 -= *v1*tmp0;
3730:           sum2 -= *v2*tmp0;
3731:           sum3 -= *v3*tmp0;
3732:           sum4 -= *v4*tmp0;
3733:           sum5 -= *v5*tmp0;
3734:         }
3735:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3736:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3737:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3738:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3739:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3740:         row     -= 5;
3741:         break;
3742:       default:
3743:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3744:       }
3745:     }
3746:     PetscLogFlops(a->nz);

3748:     /*
3749:            t = b - D x    where D is the block diagonal
3750:     */
3751:     cnt = 0;
3752:     for (i=0, row=0; i<m; i++) {
3753:       switch (sizes[i]) {
3754:       case 1:
3755:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3756:         break;
3757:       case 2:
3758:         x1       = x[row]; x2 = x[row+1];
3759:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3760:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3761:         t[row]   = b[row] - tmp1;
3762:         t[row+1] = b[row+1] - tmp2; row += 2;
3763:         cnt     += 4;
3764:         break;
3765:       case 3:
3766:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3767:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3768:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3769:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3770:         t[row]   = b[row] - tmp1;
3771:         t[row+1] = b[row+1] - tmp2;
3772:         t[row+2] = b[row+2] - tmp3; row += 3;
3773:         cnt     += 9;
3774:         break;
3775:       case 4:
3776:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3777:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3778:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3779:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3780:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3781:         t[row]   = b[row] - tmp1;
3782:         t[row+1] = b[row+1] - tmp2;
3783:         t[row+2] = b[row+2] - tmp3;
3784:         t[row+3] = b[row+3] - tmp4; row += 4;
3785:         cnt     += 16;
3786:         break;
3787:       case 5:
3788:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3789:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3790:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3791:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3792:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3793:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3794:         t[row]   = b[row] - tmp1;
3795:         t[row+1] = b[row+1] - tmp2;
3796:         t[row+2] = b[row+2] - tmp3;
3797:         t[row+3] = b[row+3] - tmp4;
3798:         t[row+4] = b[row+4] - tmp5;row += 5;
3799:         cnt     += 25;
3800:         break;
3801:       default:
3802:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3803:       }
3804:     }
3805:     PetscLogFlops(m);



3809:     /*
3810:           Apply (L + D)^-1 where D is the block diagonal
3811:     */
3812:     for (i=0, row=0; i<m; i++) {
3813:       sz  = diag[row] - ii[row];
3814:       v1  = a->a + ii[row];
3815:       idx = a->j + ii[row];
3816:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3817:       switch (sizes[i]) {
3818:       case 1:

3820:         sum1 = t[row];
3821:         for (n = 0; n<sz-1; n+=2) {
3822:           i1    = idx[0];
3823:           i2    = idx[1];
3824:           idx  += 2;
3825:           tmp0  = t[i1];
3826:           tmp1  = t[i2];
3827:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3828:         }

3830:         if (n == sz-1) {
3831:           tmp0  = t[*idx];
3832:           sum1 -= *v1 * tmp0;
3833:         }
3834:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3835:         break;
3836:       case 2:
3837:         v2   = a->a + ii[row+1];
3838:         sum1 = t[row];
3839:         sum2 = t[row+1];
3840:         for (n = 0; n<sz-1; n+=2) {
3841:           i1    = idx[0];
3842:           i2    = idx[1];
3843:           idx  += 2;
3844:           tmp0  = t[i1];
3845:           tmp1  = t[i2];
3846:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3847:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3848:         }

3850:         if (n == sz-1) {
3851:           tmp0  = t[*idx];
3852:           sum1 -= v1[0] * tmp0;
3853:           sum2 -= v2[0] * tmp0;
3854:         }
3855:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3856:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3857:         ibdiag   += 4; row += 2;
3858:         break;
3859:       case 3:
3860:         v2   = a->a + ii[row+1];
3861:         v3   = a->a + ii[row+2];
3862:         sum1 = t[row];
3863:         sum2 = t[row+1];
3864:         sum3 = t[row+2];
3865:         for (n = 0; n<sz-1; n+=2) {
3866:           i1    = idx[0];
3867:           i2    = idx[1];
3868:           idx  += 2;
3869:           tmp0  = t[i1];
3870:           tmp1  = t[i2];
3871:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3872:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3873:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3874:         }

3876:         if (n == sz-1) {
3877:           tmp0  = t[*idx];
3878:           sum1 -= v1[0] * tmp0;
3879:           sum2 -= v2[0] * tmp0;
3880:           sum3 -= v3[0] * tmp0;
3881:         }
3882:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3883:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3884:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3885:         ibdiag   += 9; row += 3;
3886:         break;
3887:       case 4:
3888:         v2   = a->a + ii[row+1];
3889:         v3   = a->a + ii[row+2];
3890:         v4   = a->a + ii[row+3];
3891:         sum1 = t[row];
3892:         sum2 = t[row+1];
3893:         sum3 = t[row+2];
3894:         sum4 = t[row+3];
3895:         for (n = 0; n<sz-1; n+=2) {
3896:           i1    = idx[0];
3897:           i2    = idx[1];
3898:           idx  += 2;
3899:           tmp0  = t[i1];
3900:           tmp1  = t[i2];
3901:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3902:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3903:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3904:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3905:         }

3907:         if (n == sz-1) {
3908:           tmp0  = t[*idx];
3909:           sum1 -= v1[0] * tmp0;
3910:           sum2 -= v2[0] * tmp0;
3911:           sum3 -= v3[0] * tmp0;
3912:           sum4 -= v4[0] * tmp0;
3913:         }
3914:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3915:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3916:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3917:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3918:         ibdiag   += 16; row += 4;
3919:         break;
3920:       case 5:
3921:         v2   = a->a + ii[row+1];
3922:         v3   = a->a + ii[row+2];
3923:         v4   = a->a + ii[row+3];
3924:         v5   = a->a + ii[row+4];
3925:         sum1 = t[row];
3926:         sum2 = t[row+1];
3927:         sum3 = t[row+2];
3928:         sum4 = t[row+3];
3929:         sum5 = t[row+4];
3930:         for (n = 0; n<sz-1; n+=2) {
3931:           i1    = idx[0];
3932:           i2    = idx[1];
3933:           idx  += 2;
3934:           tmp0  = t[i1];
3935:           tmp1  = t[i2];
3936:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3937:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3938:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3939:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3940:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3941:         }

3943:         if (n == sz-1) {
3944:           tmp0  = t[*idx];
3945:           sum1 -= v1[0] * tmp0;
3946:           sum2 -= v2[0] * tmp0;
3947:           sum3 -= v3[0] * tmp0;
3948:           sum4 -= v4[0] * tmp0;
3949:           sum5 -= v5[0] * tmp0;
3950:         }
3951:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3952:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3953:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3954:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3955:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3956:         ibdiag   += 25; row += 5;
3957:         break;
3958:       default:
3959:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3960:       }
3961:     }
3962:     PetscLogFlops(a->nz);
3963:   }
3964:   VecRestoreArray(xx,&x);
3965:   VecRestoreArrayRead(bb,&b);
3966:   return(0);
3967: }

3969: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3970: {
3971:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3972:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3973:   const MatScalar   *bdiag = a->inode.bdiag;
3974:   const PetscScalar *b;
3975:   PetscErrorCode    ierr;
3976:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3977:   const PetscInt    *sizes = a->inode.size;

3980:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3981:   VecGetArray(xx,&x);
3982:   VecGetArrayRead(bb,&b);
3983:   cnt  = 0;
3984:   for (i=0, row=0; i<m; i++) {
3985:     switch (sizes[i]) {
3986:     case 1:
3987:       x[row] = b[row]*bdiag[cnt++];row++;
3988:       break;
3989:     case 2:
3990:       x1       = b[row]; x2 = b[row+1];
3991:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3992:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3993:       x[row++] = tmp1;
3994:       x[row++] = tmp2;
3995:       cnt     += 4;
3996:       break;
3997:     case 3:
3998:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3999:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4000:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4001:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4002:       x[row++] = tmp1;
4003:       x[row++] = tmp2;
4004:       x[row++] = tmp3;
4005:       cnt     += 9;
4006:       break;
4007:     case 4:
4008:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4009:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4010:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4011:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4012:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4013:       x[row++] = tmp1;
4014:       x[row++] = tmp2;
4015:       x[row++] = tmp3;
4016:       x[row++] = tmp4;
4017:       cnt     += 16;
4018:       break;
4019:     case 5:
4020:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4021:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4022:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4023:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4024:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4025:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4026:       x[row++] = tmp1;
4027:       x[row++] = tmp2;
4028:       x[row++] = tmp3;
4029:       x[row++] = tmp4;
4030:       x[row++] = tmp5;
4031:       cnt     += 25;
4032:       break;
4033:     default:
4034:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4035:     }
4036:   }
4037:   PetscLogFlops(2.0*cnt);
4038:   VecRestoreArray(xx,&x);
4039:   VecRestoreArrayRead(bb,&b);
4040:   return(0);
4041: }

4043: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4044: {
4045:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4048:   a->inode.node_count       = 0;
4049:   a->inode.use              = PETSC_FALSE;
4050:   a->inode.checked          = PETSC_FALSE;
4051:   a->inode.mat_nonzerostate = -1;
4052:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4053:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4054:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4055:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4056:   A->ops->coloringpatch     = NULL;
4057:   A->ops->multdiagonalblock = NULL;
4058:   if (A->factortype) {
4059:     A->ops->solve = MatSolve_SeqAIJ_inplace;
4060:   }
4061:   return(0);
4062: }

4064: /*
4065:     samestructure indicates that the matrix has not changed its nonzero structure so we
4066:     do not need to recompute the inodes
4067: */
4068: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4069: {
4070:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4072:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4073:   PetscBool      flag;
4074:   const PetscInt *idx,*idy,*ii;

4077:   if (!a->inode.use) {
4078:     MatSeqAIJ_Inode_ResetOps(A);
4079:     PetscFree(a->inode.size);
4080:     return(0);
4081:   }
4082:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4084:   m = A->rmap->n;
4085:   if (!a->inode.size) { PetscMalloc1(m+1,&a->inode.size); }
4086:   ns = a->inode.size;

4088:   i          = 0;
4089:   node_count = 0;
4090:   idx        = a->j;
4091:   ii         = a->i;
4092:   while (i < m) {                /* For each row */
4093:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4094:     /* Limits the number of elements in a node to 'a->inode.limit' */
4095:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4096:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4097:       if (nzy != nzx) break;
4098:       idy += nzx;              /* Same nonzero pattern */
4099:       PetscArraycmp(idx,idy,nzx,&flag);
4100:       if (!flag) break;
4101:     }
4102:     ns[node_count++] = blk_size;
4103:     idx             += blk_size*nzx;
4104:     i                = j;
4105:   }

4107:   /* If not enough inodes found,, do not use inode version of the routines */
4108:   if (!m || node_count > .8*m) {
4109:     MatSeqAIJ_Inode_ResetOps(A);
4110:     PetscFree(a->inode.size);
4111:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4112:   } else {
4113:     if (!A->factortype) {
4114:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4115:       if (A->rmap->n == A->cmap->n) {
4116:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4117:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4118:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4119:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4120:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4121:       }
4122:     } else {
4123:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4124:     }
4125:     a->inode.node_count = node_count;
4126:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4127:   }
4128:   a->inode.checked          = PETSC_TRUE;
4129:   a->inode.mat_nonzerostate = A->nonzerostate;
4130:   return(0);
4131: }

4133: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4134: {
4135:   Mat            B =*C;
4136:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4138:   PetscInt       m=A->rmap->n;

4141:   c->inode.use              = a->inode.use;
4142:   c->inode.limit            = a->inode.limit;
4143:   c->inode.max_limit        = a->inode.max_limit;
4144:   c->inode.checked          = PETSC_FALSE;
4145:   c->inode.size             = NULL;
4146:   c->inode.node_count       = 0;
4147:   c->inode.ibdiagvalid      = PETSC_FALSE;
4148:   c->inode.ibdiag           = NULL;
4149:   c->inode.bdiag            = NULL;
4150:   c->inode.mat_nonzerostate = -1;
4151:   if (a->inode.use) {
4152:     if (a->inode.checked && a->inode.size) {
4153:       PetscMalloc1(m+1,&c->inode.size);
4154:       PetscArraycpy(c->inode.size,a->inode.size,m+1);

4156:       c->inode.checked          = PETSC_TRUE;
4157:       c->inode.node_count       = a->inode.node_count;
4158:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4159:     }
4160:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4161:     if (!B->factortype) {
4162:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4163:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4164:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4165:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4166:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4167:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4168:     } else {
4169:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4170:     }
4171:   }
4172:   return(0);
4173: }

4175: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4176: {
4177:   PetscInt       k;
4178:   const PetscInt *vi;

4181:   vi = aj + ai[row];
4182:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4183:   vi        = aj + adiag[row];
4184:   cols[nzl] = vi[0];
4185:   vi        = aj + adiag[row+1]+1;
4186:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4187:   return(0);
4188: }
4189: /*
4190:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4191:    Modified from MatSeqAIJCheckInode().

4193:    Input Parameters:
4194: .  Mat A - ILU or LU matrix factor

4196: */
4197: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4198: {
4199:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4201:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4202:   PetscInt       *cols1,*cols2,*ns;
4203:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4204:   PetscBool      flag;

4207:   if (!a->inode.use)    return(0);
4208:   if (a->inode.checked) return(0);

4210:   m = A->rmap->n;
4211:   if (a->inode.size) ns = a->inode.size;
4212:   else {
4213:     PetscMalloc1(m+1,&ns);
4214:   }

4216:   i          = 0;
4217:   node_count = 0;
4218:   PetscMalloc2(m,&cols1,m,&cols2);
4219:   while (i < m) {                /* For each row */
4220:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4221:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4222:     nzx  = nzl1 + nzu1 + 1;
4223:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4225:     /* Limits the number of elements in a node to 'a->inode.limit' */
4226:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4227:       nzl2 = ai[j+1] - ai[j];
4228:       nzu2 = adiag[j] - adiag[j+1] - 1;
4229:       nzy  = nzl2 + nzu2 + 1;
4230:       if (nzy != nzx) break;
4231:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4232:       PetscArraycmp(cols1,cols2,nzx,&flag);
4233:       if (!flag) break;
4234:     }
4235:     ns[node_count++] = blk_size;
4236:     i                = j;
4237:   }
4238:   PetscFree2(cols1,cols2);
4239:   /* If not enough inodes found,, do not use inode version of the routines */
4240:   if (!m || node_count > .8*m) {
4241:     PetscFree(ns);

4243:     a->inode.node_count = 0;
4244:     a->inode.size       = NULL;
4245:     a->inode.use        = PETSC_FALSE;

4247:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4248:   } else {
4249:     A->ops->mult              = NULL;
4250:     A->ops->sor               = NULL;
4251:     A->ops->multadd           = NULL;
4252:     A->ops->getrowij          = NULL;
4253:     A->ops->restorerowij      = NULL;
4254:     A->ops->getcolumnij       = NULL;
4255:     A->ops->restorecolumnij   = NULL;
4256:     A->ops->coloringpatch     = NULL;
4257:     A->ops->multdiagonalblock = NULL;
4258:     a->inode.node_count       = node_count;
4259:     a->inode.size             = ns;

4261:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4262:   }
4263:   a->inode.checked = PETSC_TRUE;
4264:   return(0);
4265: }

4267: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4268: {
4269:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4272:   a->inode.ibdiagvalid = PETSC_FALSE;
4273:   return(0);
4274: }

4276: /*
4277:      This is really ugly. if inodes are used this replaces the
4278:   permutations with ones that correspond to rows/cols of the matrix
4279:   rather then inode blocks
4280: */
4281: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4282: {

4286:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4287:   return(0);
4288: }

4290: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4291: {
4292:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4294:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4295:   const PetscInt *ridx,*cidx;
4296:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4297:   PetscInt       nslim_col,*ns_col;
4298:   IS             ris = *rperm,cis = *cperm;

4301:   if (!a->inode.size) return(0); /* no inodes so return */
4302:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4304:   MatCreateColInode_Private(A,&nslim_col,&ns_col);
4305:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4306:   PetscMalloc2(m,&permr,n,&permc);

4308:   ISGetIndices(ris,&ridx);
4309:   ISGetIndices(cis,&cidx);

4311:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4312:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4314:   /* Construct the permutations for rows*/
4315:   for (i=0,row = 0; i<nslim_row; ++i) {
4316:     indx      = ridx[i];
4317:     start_val = tns[indx];
4318:     end_val   = tns[indx + 1];
4319:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4320:   }

4322:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4323:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4325:   /* Construct permutations for columns */
4326:   for (i=0,col=0; i<nslim_col; ++i) {
4327:     indx      = cidx[i];
4328:     start_val = tns[indx];
4329:     end_val   = tns[indx + 1];
4330:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4331:   }

4333:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4334:   ISSetPermutation(*rperm);
4335:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4336:   ISSetPermutation(*cperm);

4338:   ISRestoreIndices(ris,&ridx);
4339:   ISRestoreIndices(cis,&cidx);

4341:   PetscFree(ns_col);
4342:   PetscFree2(permr,permc);
4343:   ISDestroy(&cis);
4344:   ISDestroy(&ris);
4345:   PetscFree(tns);
4346:   return(0);
4347: }

4349: /*@C
4350:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4352:    Not Collective

4354:    Input Parameter:
4355: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4357:    Output Parameter:
4358: +  node_count - no of inodes present in the matrix.
4359: .  sizes      - an array of size node_count,with sizes of each inode.
4360: -  limit      - the max size used to generate the inodes.

4362:    Level: advanced

4364:    Notes:
4365:     This routine returns some internal storage information
4366:    of the matrix, it is intended to be used by advanced users.
4367:    It should be called after the matrix is assembled.
4368:    The contents of the sizes[] array should not be changed.
4369:    NULL may be passed for information not requested.

4371: .seealso: MatGetInfo()
4372: @*/
4373: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4374: {
4375:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4378:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4379:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4380:   if (f) {
4381:     (*f)(A,node_count,sizes,limit);
4382:   }
4383:   return(0);
4384: }

4386: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4387: {
4388:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4391:   if (node_count) *node_count = a->inode.node_count;
4392:   if (sizes)      *sizes      = a->inode.size;
4393:   if (limit)      *limit      = a->inode.limit;
4394:   return(0);
4395: }