petsc-3.12.5 2020-03-29

  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6:  #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 12:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 15:   n      = A->cmap->n;
 16:   m      = A->rmap->n;
 17:   ns_row = a->inode.size;

 19:   min_mn = (m < n) ? m : n;
 20:   if (!ns) {
 21:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 22:     for (; count+1 < n; count++,i++) ;
 23:     if (count < n)  {
 24:       i++;
 25:     }
 26:     *size = i;
 27:     return(0);
 28:   }
 29:   PetscMalloc1(n+1,&ns_col);

 31:   /* Use the same row structure wherever feasible. */
 32:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 33:     ns_col[i] = ns_row[i];
 34:   }

 36:   /* if m < n; pad up the remainder with inode_limit */
 37:   for (; count+1 < n; count++,i++) {
 38:     ns_col[i] = 1;
 39:   }
 40:   /* The last node is the odd ball. padd it up with the remaining rows; */
 41:   if (count < n) {
 42:     ns_col[i] = n - count;
 43:     i++;
 44:   } else if (count > n) {
 45:     /* Adjust for the over estimation */
 46:     ns_col[i-1] += n - count;
 47:   }
 48:   *size = i;
 49:   *ns   = ns_col;
 50:   return(0);
 51: }


 54: /*
 55:       This builds symmetric version of nonzero structure,
 56: */
 57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 58: {
 59:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 61:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 62:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 63:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 66:   nslim_row = a->inode.node_count;
 67:   m         = A->rmap->n;
 68:   n         = A->cmap->n;
 69:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 71:   /* Use the row_inode as column_inode */
 72:   nslim_col = nslim_row;
 73:   ns_col    = ns_row;

 75:   /* allocate space for reformated inode structure */
 76:   PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
 77:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 79:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 80:     nsz = ns_col[i1];
 81:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 82:   }
 83:   /* allocate space for row pointers */
 84:   PetscCalloc1(nslim_row+1,&ia);
 85:   *iia = ia;
 86:   PetscMalloc1(nslim_row+1,&work);

 88:   /* determine the number of columns in each row */
 89:   ia[0] = oshift;
 90:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 92:     j    = aj + ai[row] + ishift;
 93:     jmax = aj + ai[row+1] + ishift;
 94:     if (j==jmax) continue; /* empty row */
 95:     col  = *j++ + ishift;
 96:     i2   = tvc[col];
 97:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
 98:       ia[i1+1]++;
 99:       ia[i2+1]++;
100:       i2++;                     /* Start col of next node */
101:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
102:       i2 = tvc[col];
103:     }
104:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
105:   }

107:   /* shift ia[i] to point to next row */
108:   for (i1=1; i1<nslim_row+1; i1++) {
109:     row        = ia[i1-1];
110:     ia[i1]    += row;
111:     work[i1-1] = row - oshift;
112:   }

114:   /* allocate space for column pointers */
115:   nz   = ia[nslim_row] + (!ishift);
116:   PetscMalloc1(nz,&ja);
117:   *jja = ja;

119:   /* loop over lower triangular part putting into ja */
120:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
121:     j    = aj + ai[row] + ishift;
122:     jmax = aj + ai[row+1] + ishift;
123:     if (j==jmax) continue; /* empty row */
124:     col  = *j++ + ishift;
125:     i2   = tvc[col];
126:     while (i2<i1 && j<jmax) {
127:       ja[work[i2]++] = i1 + oshift;
128:       ja[work[i1]++] = i2 + oshift;
129:       ++i2;
130:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
131:       i2 = tvc[col];
132:     }
133:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

135:   }
136:   PetscFree(work);
137:   PetscFree2(tns,tvc);
138:   return(0);
139: }

141: /*
142:       This builds nonsymmetric version of nonzero structure,
143: */
144: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
145: {
146:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
148:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
149:   PetscInt       *tns,*tvc,nsz,i1,i2;
150:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

153:   nslim_row = a->inode.node_count;
154:   n         = A->cmap->n;

156:   /* Create The column_inode for this matrix */
157:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

159:   /* allocate space for reformated column_inode structure */
160:   PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
161:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

163:   for (i1=0,col=0; i1<nslim_col; ++i1) {
164:     nsz = ns_col[i1];
165:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
166:   }
167:   /* allocate space for row pointers */
168:   PetscCalloc1(nslim_row+1,&ia);
169:   *iia = ia;
170:   PetscMalloc1(nslim_row+1,&work);

172:   /* determine the number of columns in each row */
173:   ia[0] = oshift;
174:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
175:     j   = aj + ai[row] + ishift;
176:     nz  = ai[row+1] - ai[row];
177:     if (!nz) continue; /* empty row */
178:     col = *j++ + ishift;
179:     i2  = tvc[col];
180:     while (nz-- > 0) {           /* off-diagonal elemets */
181:       ia[i1+1]++;
182:       i2++;                     /* Start col of next node */
183:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
184:       if (nz > 0) i2 = tvc[col];
185:     }
186:   }

188:   /* shift ia[i] to point to next row */
189:   for (i1=1; i1<nslim_row+1; i1++) {
190:     row        = ia[i1-1];
191:     ia[i1]    += row;
192:     work[i1-1] = row - oshift;
193:   }

195:   /* allocate space for column pointers */
196:   nz   = ia[nslim_row] + (!ishift);
197:   PetscMalloc1(nz,&ja);
198:   *jja = ja;

200:   /* loop over matrix putting into ja */
201:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
202:     j   = aj + ai[row] + ishift;
203:     nz  = ai[row+1] - ai[row];
204:     if (!nz) continue; /* empty row */
205:     col = *j++ + ishift;
206:     i2  = tvc[col];
207:     while (nz-- > 0) {
208:       ja[work[i1]++] = i2 + oshift;
209:       ++i2;
210:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
211:       if (nz > 0) i2 = tvc[col];
212:     }
213:   }
214:   PetscFree(ns_col);
215:   PetscFree(work);
216:   PetscFree2(tns,tvc);
217:   return(0);
218: }

220: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
221: {
222:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

226:   *n = a->inode.node_count;
227:   if (!ia) return(0);
228:   if (!blockcompressed) {
229:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
230:   } else if (symmetric) {
231:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
232:   } else {
233:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
234:   }
235:   return(0);
236: }

238: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
239: {

243:   if (!ia) return(0);

245:   if (!blockcompressed) {
246:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
247:   } else {
248:     PetscFree(*ia);
249:     PetscFree(*ja);
250:   }
251:   return(0);
252: }

254: /* ----------------------------------------------------------- */

256: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
257: {
258:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
260:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
261:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

264:   nslim_row = a->inode.node_count;
265:   n         = A->cmap->n;

267:   /* Create The column_inode for this matrix */
268:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

270:   /* allocate space for reformated column_inode structure */
271:   PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
272:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

274:   for (i1=0,col=0; i1<nslim_col; ++i1) {
275:     nsz = ns_col[i1];
276:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
277:   }
278:   /* allocate space for column pointers */
279:   PetscCalloc1(nslim_col+1,&ia);
280:   *iia = ia;
281:   PetscMalloc1(nslim_col+1,&work);

283:   /* determine the number of columns in each row */
284:   ia[0] = oshift;
285:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
286:     j   = aj + ai[row] + ishift;
287:     col = *j++ + ishift;
288:     i2  = tvc[col];
289:     nz  = ai[row+1] - ai[row];
290:     while (nz-- > 0) {           /* off-diagonal elemets */
291:       /* ia[i1+1]++; */
292:       ia[i2+1]++;
293:       i2++;
294:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
295:       if (nz > 0) i2 = tvc[col];
296:     }
297:   }

299:   /* shift ia[i] to point to next col */
300:   for (i1=1; i1<nslim_col+1; i1++) {
301:     col        = ia[i1-1];
302:     ia[i1]    += col;
303:     work[i1-1] = col - oshift;
304:   }

306:   /* allocate space for column pointers */
307:   nz   = ia[nslim_col] + (!ishift);
308:   PetscMalloc1(nz,&ja);
309:   *jja = ja;

311:   /* loop over matrix putting into ja */
312:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
313:     j   = aj + ai[row] + ishift;
314:     col = *j++ + ishift;
315:     i2  = tvc[col];
316:     nz  = ai[row+1] - ai[row];
317:     while (nz-- > 0) {
318:       /* ja[work[i1]++] = i2 + oshift; */
319:       ja[work[i2]++] = i1 + oshift;
320:       i2++;
321:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
322:       if (nz > 0) i2 = tvc[col];
323:     }
324:   }
325:   PetscFree(ns_col);
326:   PetscFree(work);
327:   PetscFree2(tns,tvc);
328:   return(0);
329: }

331: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
332: {

336:   MatCreateColInode_Private(A,n,NULL);
337:   if (!ia) return(0);

339:   if (!blockcompressed) {
340:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
341:   } else if (symmetric) {
342:     /* Since the indices are symmetric it does'nt matter */
343:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
344:   } else {
345:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
346:   }
347:   return(0);
348: }

350: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351: {

355:   if (!ia) return(0);
356:   if (!blockcompressed) {
357:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
358:   } else {
359:     PetscFree(*ia);
360:     PetscFree(*ja);
361:   }
362:   return(0);
363: }

365: /* ----------------------------------------------------------- */

367: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
368: {
369:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
370:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
371:   PetscScalar       *y;
372:   const PetscScalar *x;
373:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
374:   PetscErrorCode    ierr;
375:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
376:   const PetscInt    *idx,*ns,*ii;

378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
380: #endif

383:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
384:   node_max = a->inode.node_count;
385:   ns       = a->inode.size;     /* Node Size array */
386:   VecGetArrayRead(xx,&x);
387:   VecGetArray(yy,&y);
388:   idx      = a->j;
389:   v1       = a->a;
390:   ii       = a->i;

392:   for (i = 0,row = 0; i< node_max; ++i) {
393:     nsz         = ns[i];
394:     n           = ii[1] - ii[0];
395:     nonzerorow += (n>0)*nsz;
396:     ii         += nsz;
397:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
398:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
399:     sz = n;                     /* No of non zeros in this row */
400:                                 /* Switch on the size of Node */
401:     switch (nsz) {               /* Each loop in 'case' is unrolled */
402:     case 1:
403:       sum1 = 0.;

405:       for (n = 0; n< sz-1; n+=2) {
406:         i1    = idx[0];         /* The instructions are ordered to */
407:         i2    = idx[1];         /* make the compiler's job easy */
408:         idx  += 2;
409:         tmp0  = x[i1];
410:         tmp1  = x[i2];
411:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
412:       }

414:       if (n == sz-1) {          /* Take care of the last nonzero  */
415:         tmp0  = x[*idx++];
416:         sum1 += *v1++ *tmp0;
417:       }
418:       y[row++]=sum1;
419:       break;
420:     case 2:
421:       sum1 = 0.;
422:       sum2 = 0.;
423:       v2   = v1 + n;

425:       for (n = 0; n< sz-1; n+=2) {
426:         i1    = idx[0];
427:         i2    = idx[1];
428:         idx  += 2;
429:         tmp0  = x[i1];
430:         tmp1  = x[i2];
431:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
433:       }
434:       if (n == sz-1) {
435:         tmp0  = x[*idx++];
436:         sum1 += *v1++ * tmp0;
437:         sum2 += *v2++ * tmp0;
438:       }
439:       y[row++]=sum1;
440:       y[row++]=sum2;
441:       v1      =v2;              /* Since the next block to be processed starts there*/
442:       idx    +=sz;
443:       break;
444:     case 3:
445:       sum1 = 0.;
446:       sum2 = 0.;
447:       sum3 = 0.;
448:       v2   = v1 + n;
449:       v3   = v2 + n;

451:       for (n = 0; n< sz-1; n+=2) {
452:         i1    = idx[0];
453:         i2    = idx[1];
454:         idx  += 2;
455:         tmp0  = x[i1];
456:         tmp1  = x[i2];
457:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
458:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
459:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
460:       }
461:       if (n == sz-1) {
462:         tmp0  = x[*idx++];
463:         sum1 += *v1++ * tmp0;
464:         sum2 += *v2++ * tmp0;
465:         sum3 += *v3++ * tmp0;
466:       }
467:       y[row++]=sum1;
468:       y[row++]=sum2;
469:       y[row++]=sum3;
470:       v1      =v3;              /* Since the next block to be processed starts there*/
471:       idx    +=2*sz;
472:       break;
473:     case 4:
474:       sum1 = 0.;
475:       sum2 = 0.;
476:       sum3 = 0.;
477:       sum4 = 0.;
478:       v2   = v1 + n;
479:       v3   = v2 + n;
480:       v4   = v3 + n;

482:       for (n = 0; n< sz-1; n+=2) {
483:         i1    = idx[0];
484:         i2    = idx[1];
485:         idx  += 2;
486:         tmp0  = x[i1];
487:         tmp1  = x[i2];
488:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
489:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
490:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
491:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
492:       }
493:       if (n == sz-1) {
494:         tmp0  = x[*idx++];
495:         sum1 += *v1++ * tmp0;
496:         sum2 += *v2++ * tmp0;
497:         sum3 += *v3++ * tmp0;
498:         sum4 += *v4++ * tmp0;
499:       }
500:       y[row++]=sum1;
501:       y[row++]=sum2;
502:       y[row++]=sum3;
503:       y[row++]=sum4;
504:       v1      =v4;              /* Since the next block to be processed starts there*/
505:       idx    +=3*sz;
506:       break;
507:     case 5:
508:       sum1 = 0.;
509:       sum2 = 0.;
510:       sum3 = 0.;
511:       sum4 = 0.;
512:       sum5 = 0.;
513:       v2   = v1 + n;
514:       v3   = v2 + n;
515:       v4   = v3 + n;
516:       v5   = v4 + n;

518:       for (n = 0; n<sz-1; n+=2) {
519:         i1    = idx[0];
520:         i2    = idx[1];
521:         idx  += 2;
522:         tmp0  = x[i1];
523:         tmp1  = x[i2];
524:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
525:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
526:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
527:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
528:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
529:       }
530:       if (n == sz-1) {
531:         tmp0  = x[*idx++];
532:         sum1 += *v1++ * tmp0;
533:         sum2 += *v2++ * tmp0;
534:         sum3 += *v3++ * tmp0;
535:         sum4 += *v4++ * tmp0;
536:         sum5 += *v5++ * tmp0;
537:       }
538:       y[row++]=sum1;
539:       y[row++]=sum2;
540:       y[row++]=sum3;
541:       y[row++]=sum4;
542:       y[row++]=sum5;
543:       v1      =v5;       /* Since the next block to be processed starts there */
544:       idx    +=4*sz;
545:       break;
546:     default:
547:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
548:     }
549:   }
550:   VecRestoreArrayRead(xx,&x);
551:   VecRestoreArray(yy,&y);
552:   PetscLogFlops(2.0*a->nz - nonzerorow);
553:   return(0);
554: }
555: /* ----------------------------------------------------------- */
556: /* Almost same code as the MatMult_SeqAIJ_Inode() */
557: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
558: {
559:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
560:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
561:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
562:   const PetscScalar *x;
563:   PetscScalar       *y,*z,*zt;
564:   PetscErrorCode    ierr;
565:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
566:   const PetscInt    *idx,*ns,*ii;

569:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
570:   node_max = a->inode.node_count;
571:   ns       = a->inode.size;     /* Node Size array */

573:   VecGetArrayRead(xx,&x);
574:   VecGetArrayPair(zz,yy,&z,&y);
575:   zt = z;

577:   idx = a->j;
578:   v1  = a->a;
579:   ii  = a->i;

581:   for (i = 0,row = 0; i< node_max; ++i) {
582:     nsz = ns[i];
583:     n   = ii[1] - ii[0];
584:     ii += nsz;
585:     sz  = n;                    /* No of non zeros in this row */
586:                                 /* Switch on the size of Node */
587:     switch (nsz) {               /* Each loop in 'case' is unrolled */
588:     case 1:
589:       sum1 = *zt++;

591:       for (n = 0; n< sz-1; n+=2) {
592:         i1    = idx[0];         /* The instructions are ordered to */
593:         i2    = idx[1];         /* make the compiler's job easy */
594:         idx  += 2;
595:         tmp0  = x[i1];
596:         tmp1  = x[i2];
597:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598:       }

600:       if (n   == sz-1) {          /* Take care of the last nonzero  */
601:         tmp0  = x[*idx++];
602:         sum1 += *v1++ * tmp0;
603:       }
604:       y[row++]=sum1;
605:       break;
606:     case 2:
607:       sum1 = *zt++;
608:       sum2 = *zt++;
609:       v2   = v1 + n;

611:       for (n = 0; n< sz-1; n+=2) {
612:         i1    = idx[0];
613:         i2    = idx[1];
614:         idx  += 2;
615:         tmp0  = x[i1];
616:         tmp1  = x[i2];
617:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
618:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
619:       }
620:       if (n   == sz-1) {
621:         tmp0  = x[*idx++];
622:         sum1 += *v1++ * tmp0;
623:         sum2 += *v2++ * tmp0;
624:       }
625:       y[row++]=sum1;
626:       y[row++]=sum2;
627:       v1      =v2;              /* Since the next block to be processed starts there*/
628:       idx    +=sz;
629:       break;
630:     case 3:
631:       sum1 = *zt++;
632:       sum2 = *zt++;
633:       sum3 = *zt++;
634:       v2   = v1 + n;
635:       v3   = v2 + n;

637:       for (n = 0; n< sz-1; n+=2) {
638:         i1    = idx[0];
639:         i2    = idx[1];
640:         idx  += 2;
641:         tmp0  = x[i1];
642:         tmp1  = x[i2];
643:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
644:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
645:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
646:       }
647:       if (n == sz-1) {
648:         tmp0  = x[*idx++];
649:         sum1 += *v1++ * tmp0;
650:         sum2 += *v2++ * tmp0;
651:         sum3 += *v3++ * tmp0;
652:       }
653:       y[row++]=sum1;
654:       y[row++]=sum2;
655:       y[row++]=sum3;
656:       v1      =v3;              /* Since the next block to be processed starts there*/
657:       idx    +=2*sz;
658:       break;
659:     case 4:
660:       sum1 = *zt++;
661:       sum2 = *zt++;
662:       sum3 = *zt++;
663:       sum4 = *zt++;
664:       v2   = v1 + n;
665:       v3   = v2 + n;
666:       v4   = v3 + n;

668:       for (n = 0; n< sz-1; n+=2) {
669:         i1    = idx[0];
670:         i2    = idx[1];
671:         idx  += 2;
672:         tmp0  = x[i1];
673:         tmp1  = x[i2];
674:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
675:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
676:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
677:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
678:       }
679:       if (n == sz-1) {
680:         tmp0  = x[*idx++];
681:         sum1 += *v1++ * tmp0;
682:         sum2 += *v2++ * tmp0;
683:         sum3 += *v3++ * tmp0;
684:         sum4 += *v4++ * tmp0;
685:       }
686:       y[row++]=sum1;
687:       y[row++]=sum2;
688:       y[row++]=sum3;
689:       y[row++]=sum4;
690:       v1      =v4;              /* Since the next block to be processed starts there*/
691:       idx    +=3*sz;
692:       break;
693:     case 5:
694:       sum1 = *zt++;
695:       sum2 = *zt++;
696:       sum3 = *zt++;
697:       sum4 = *zt++;
698:       sum5 = *zt++;
699:       v2   = v1 + n;
700:       v3   = v2 + n;
701:       v4   = v3 + n;
702:       v5   = v4 + n;

704:       for (n = 0; n<sz-1; n+=2) {
705:         i1    = idx[0];
706:         i2    = idx[1];
707:         idx  += 2;
708:         tmp0  = x[i1];
709:         tmp1  = x[i2];
710:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
711:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
712:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
713:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
714:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
715:       }
716:       if (n   == sz-1) {
717:         tmp0  = x[*idx++];
718:         sum1 += *v1++ * tmp0;
719:         sum2 += *v2++ * tmp0;
720:         sum3 += *v3++ * tmp0;
721:         sum4 += *v4++ * tmp0;
722:         sum5 += *v5++ * tmp0;
723:       }
724:       y[row++]=sum1;
725:       y[row++]=sum2;
726:       y[row++]=sum3;
727:       y[row++]=sum4;
728:       y[row++]=sum5;
729:       v1      =v5;       /* Since the next block to be processed starts there */
730:       idx    +=4*sz;
731:       break;
732:     default:
733:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
734:     }
735:   }
736:   VecRestoreArrayRead(xx,&x);
737:   VecRestoreArrayPair(zz,yy,&z,&y);
738:   PetscLogFlops(2.0*a->nz);
739:   return(0);
740: }

742: /* ----------------------------------------------------------- */
743: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
744: {
745:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
746:   IS                iscol = a->col,isrow = a->row;
747:   PetscErrorCode    ierr;
748:   const PetscInt    *r,*c,*rout,*cout;
749:   PetscInt          i,j,n = A->rmap->n,nz;
750:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
751:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
752:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
753:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
754:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
755:   const PetscScalar *b;

758:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
759:   node_max = a->inode.node_count;
760:   ns       = a->inode.size;     /* Node Size array */

762:   VecGetArrayRead(bb,&b);
763:   VecGetArrayWrite(xx,&x);
764:   tmp  = a->solve_work;

766:   ISGetIndices(isrow,&rout); r = rout;
767:   ISGetIndices(iscol,&cout); c = cout + (n-1);

769:   /* forward solve the lower triangular */
770:   tmps = tmp;
771:   aa   = a_a;
772:   aj   = a_j;
773:   ad   = a->diag;

775:   for (i = 0,row = 0; i< node_max; ++i) {
776:     nsz = ns[i];
777:     aii = ai[row];
778:     v1  = aa + aii;
779:     vi  = aj + aii;
780:     nz  = ad[row]- aii;
781:     if (i < node_max-1) {
782:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
783:       * but our indexing to determine it's size could. */
784:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
785:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
786:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
787:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
788:     }

790:     switch (nsz) {               /* Each loop in 'case' is unrolled */
791:     case 1:
792:       sum1 = b[*r++];
793:       for (j=0; j<nz-1; j+=2) {
794:         i0    = vi[0];
795:         i1    = vi[1];
796:         vi   +=2;
797:         tmp0  = tmps[i0];
798:         tmp1  = tmps[i1];
799:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
800:       }
801:       if (j == nz-1) {
802:         tmp0  = tmps[*vi++];
803:         sum1 -= *v1++ *tmp0;
804:       }
805:       tmp[row++]=sum1;
806:       break;
807:     case 2:
808:       sum1 = b[*r++];
809:       sum2 = b[*r++];
810:       v2   = aa + ai[row+1];

812:       for (j=0; j<nz-1; j+=2) {
813:         i0    = vi[0];
814:         i1    = vi[1];
815:         vi   +=2;
816:         tmp0  = tmps[i0];
817:         tmp1  = tmps[i1];
818:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
819:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
820:       }
821:       if (j == nz-1) {
822:         tmp0  = tmps[*vi++];
823:         sum1 -= *v1++ *tmp0;
824:         sum2 -= *v2++ *tmp0;
825:       }
826:       sum2     -= *v2++ *sum1;
827:       tmp[row++]=sum1;
828:       tmp[row++]=sum2;
829:       break;
830:     case 3:
831:       sum1 = b[*r++];
832:       sum2 = b[*r++];
833:       sum3 = b[*r++];
834:       v2   = aa + ai[row+1];
835:       v3   = aa + ai[row+2];

837:       for (j=0; j<nz-1; j+=2) {
838:         i0    = vi[0];
839:         i1    = vi[1];
840:         vi   +=2;
841:         tmp0  = tmps[i0];
842:         tmp1  = tmps[i1];
843:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
844:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
845:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
846:       }
847:       if (j == nz-1) {
848:         tmp0  = tmps[*vi++];
849:         sum1 -= *v1++ *tmp0;
850:         sum2 -= *v2++ *tmp0;
851:         sum3 -= *v3++ *tmp0;
852:       }
853:       sum2 -= *v2++ * sum1;
854:       sum3 -= *v3++ * sum1;
855:       sum3 -= *v3++ * sum2;

857:       tmp[row++]=sum1;
858:       tmp[row++]=sum2;
859:       tmp[row++]=sum3;
860:       break;

862:     case 4:
863:       sum1 = b[*r++];
864:       sum2 = b[*r++];
865:       sum3 = b[*r++];
866:       sum4 = b[*r++];
867:       v2   = aa + ai[row+1];
868:       v3   = aa + ai[row+2];
869:       v4   = aa + ai[row+3];

871:       for (j=0; j<nz-1; j+=2) {
872:         i0    = vi[0];
873:         i1    = vi[1];
874:         vi   +=2;
875:         tmp0  = tmps[i0];
876:         tmp1  = tmps[i1];
877:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
878:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
879:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
880:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
881:       }
882:       if (j == nz-1) {
883:         tmp0  = tmps[*vi++];
884:         sum1 -= *v1++ *tmp0;
885:         sum2 -= *v2++ *tmp0;
886:         sum3 -= *v3++ *tmp0;
887:         sum4 -= *v4++ *tmp0;
888:       }
889:       sum2 -= *v2++ * sum1;
890:       sum3 -= *v3++ * sum1;
891:       sum4 -= *v4++ * sum1;
892:       sum3 -= *v3++ * sum2;
893:       sum4 -= *v4++ * sum2;
894:       sum4 -= *v4++ * sum3;

896:       tmp[row++]=sum1;
897:       tmp[row++]=sum2;
898:       tmp[row++]=sum3;
899:       tmp[row++]=sum4;
900:       break;
901:     case 5:
902:       sum1 = b[*r++];
903:       sum2 = b[*r++];
904:       sum3 = b[*r++];
905:       sum4 = b[*r++];
906:       sum5 = b[*r++];
907:       v2   = aa + ai[row+1];
908:       v3   = aa + ai[row+2];
909:       v4   = aa + ai[row+3];
910:       v5   = aa + ai[row+4];

912:       for (j=0; j<nz-1; j+=2) {
913:         i0    = vi[0];
914:         i1    = vi[1];
915:         vi   +=2;
916:         tmp0  = tmps[i0];
917:         tmp1  = tmps[i1];
918:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
919:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
920:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
921:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
922:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
923:       }
924:       if (j == nz-1) {
925:         tmp0  = tmps[*vi++];
926:         sum1 -= *v1++ *tmp0;
927:         sum2 -= *v2++ *tmp0;
928:         sum3 -= *v3++ *tmp0;
929:         sum4 -= *v4++ *tmp0;
930:         sum5 -= *v5++ *tmp0;
931:       }

933:       sum2 -= *v2++ * sum1;
934:       sum3 -= *v3++ * sum1;
935:       sum4 -= *v4++ * sum1;
936:       sum5 -= *v5++ * sum1;
937:       sum3 -= *v3++ * sum2;
938:       sum4 -= *v4++ * sum2;
939:       sum5 -= *v5++ * sum2;
940:       sum4 -= *v4++ * sum3;
941:       sum5 -= *v5++ * sum3;
942:       sum5 -= *v5++ * sum4;

944:       tmp[row++]=sum1;
945:       tmp[row++]=sum2;
946:       tmp[row++]=sum3;
947:       tmp[row++]=sum4;
948:       tmp[row++]=sum5;
949:       break;
950:     default:
951:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
952:     }
953:   }
954:   /* backward solve the upper triangular */
955:   for (i=node_max -1,row = n-1; i>=0; i--) {
956:     nsz = ns[i];
957:     aii = ai[row+1] -1;
958:     v1  = aa + aii;
959:     vi  = aj + aii;
960:     nz  = aii- ad[row];
961:     switch (nsz) {               /* Each loop in 'case' is unrolled */
962:     case 1:
963:       sum1 = tmp[row];

965:       for (j=nz; j>1; j-=2) {
966:         vi   -=2;
967:         i0    = vi[2];
968:         i1    = vi[1];
969:         tmp0  = tmps[i0];
970:         tmp1  = tmps[i1];
971:         v1   -= 2;
972:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
973:       }
974:       if (j==1) {
975:         tmp0  = tmps[*vi--];
976:         sum1 -= *v1-- * tmp0;
977:       }
978:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
979:       break;
980:     case 2:
981:       sum1 = tmp[row];
982:       sum2 = tmp[row -1];
983:       v2   = aa + ai[row]-1;
984:       for (j=nz; j>1; j-=2) {
985:         vi   -=2;
986:         i0    = vi[2];
987:         i1    = vi[1];
988:         tmp0  = tmps[i0];
989:         tmp1  = tmps[i1];
990:         v1   -= 2;
991:         v2   -= 2;
992:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
993:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
994:       }
995:       if (j==1) {
996:         tmp0  = tmps[*vi--];
997:         sum1 -= *v1-- * tmp0;
998:         sum2 -= *v2-- * tmp0;
999:       }

1001:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1002:       sum2   -= *v2-- * tmp0;
1003:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1004:       break;
1005:     case 3:
1006:       sum1 = tmp[row];
1007:       sum2 = tmp[row -1];
1008:       sum3 = tmp[row -2];
1009:       v2   = aa + ai[row]-1;
1010:       v3   = aa + ai[row -1]-1;
1011:       for (j=nz; j>1; j-=2) {
1012:         vi   -=2;
1013:         i0    = vi[2];
1014:         i1    = vi[1];
1015:         tmp0  = tmps[i0];
1016:         tmp1  = tmps[i1];
1017:         v1   -= 2;
1018:         v2   -= 2;
1019:         v3   -= 2;
1020:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1023:       }
1024:       if (j==1) {
1025:         tmp0  = tmps[*vi--];
1026:         sum1 -= *v1-- * tmp0;
1027:         sum2 -= *v2-- * tmp0;
1028:         sum3 -= *v3-- * tmp0;
1029:       }
1030:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031:       sum2   -= *v2-- * tmp0;
1032:       sum3   -= *v3-- * tmp0;
1033:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1034:       sum3   -= *v3-- * tmp0;
1035:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1037:       break;
1038:     case 4:
1039:       sum1 = tmp[row];
1040:       sum2 = tmp[row -1];
1041:       sum3 = tmp[row -2];
1042:       sum4 = tmp[row -3];
1043:       v2   = aa + ai[row]-1;
1044:       v3   = aa + ai[row -1]-1;
1045:       v4   = aa + ai[row -2]-1;

1047:       for (j=nz; j>1; j-=2) {
1048:         vi   -=2;
1049:         i0    = vi[2];
1050:         i1    = vi[1];
1051:         tmp0  = tmps[i0];
1052:         tmp1  = tmps[i1];
1053:         v1   -= 2;
1054:         v2   -= 2;
1055:         v3   -= 2;
1056:         v4   -= 2;
1057:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1058:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1059:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1060:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1061:       }
1062:       if (j==1) {
1063:         tmp0  = tmps[*vi--];
1064:         sum1 -= *v1-- * tmp0;
1065:         sum2 -= *v2-- * tmp0;
1066:         sum3 -= *v3-- * tmp0;
1067:         sum4 -= *v4-- * tmp0;
1068:       }

1070:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1071:       sum2   -= *v2-- * tmp0;
1072:       sum3   -= *v3-- * tmp0;
1073:       sum4   -= *v4-- * tmp0;
1074:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1075:       sum3   -= *v3-- * tmp0;
1076:       sum4   -= *v4-- * tmp0;
1077:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1078:       sum4   -= *v4-- * tmp0;
1079:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1080:       break;
1081:     case 5:
1082:       sum1 = tmp[row];
1083:       sum2 = tmp[row -1];
1084:       sum3 = tmp[row -2];
1085:       sum4 = tmp[row -3];
1086:       sum5 = tmp[row -4];
1087:       v2   = aa + ai[row]-1;
1088:       v3   = aa + ai[row -1]-1;
1089:       v4   = aa + ai[row -2]-1;
1090:       v5   = aa + ai[row -3]-1;
1091:       for (j=nz; j>1; j-=2) {
1092:         vi   -= 2;
1093:         i0    = vi[2];
1094:         i1    = vi[1];
1095:         tmp0  = tmps[i0];
1096:         tmp1  = tmps[i1];
1097:         v1   -= 2;
1098:         v2   -= 2;
1099:         v3   -= 2;
1100:         v4   -= 2;
1101:         v5   -= 2;
1102:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1103:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1104:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1105:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1106:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1107:       }
1108:       if (j==1) {
1109:         tmp0  = tmps[*vi--];
1110:         sum1 -= *v1-- * tmp0;
1111:         sum2 -= *v2-- * tmp0;
1112:         sum3 -= *v3-- * tmp0;
1113:         sum4 -= *v4-- * tmp0;
1114:         sum5 -= *v5-- * tmp0;
1115:       }

1117:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1118:       sum2   -= *v2-- * tmp0;
1119:       sum3   -= *v3-- * tmp0;
1120:       sum4   -= *v4-- * tmp0;
1121:       sum5   -= *v5-- * tmp0;
1122:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1123:       sum3   -= *v3-- * tmp0;
1124:       sum4   -= *v4-- * tmp0;
1125:       sum5   -= *v5-- * tmp0;
1126:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1127:       sum4   -= *v4-- * tmp0;
1128:       sum5   -= *v5-- * tmp0;
1129:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1130:       sum5   -= *v5-- * tmp0;
1131:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1132:       break;
1133:     default:
1134:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1135:     }
1136:   }
1137:   ISRestoreIndices(isrow,&rout);
1138:   ISRestoreIndices(iscol,&cout);
1139:   VecRestoreArrayRead(bb,&b);
1140:   VecRestoreArrayWrite(xx,&x);
1141:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1142:   return(0);
1143: }

1145: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1146: {
1147:   Mat             C     =B;
1148:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1149:   IS              isrow = b->row,isicol = b->icol;
1150:   PetscErrorCode  ierr;
1151:   const PetscInt  *r,*ic,*ics;
1152:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1153:   PetscInt        i,j,k,nz,nzL,row,*pj;
1154:   const PetscInt  *ajtmp,*bjtmp;
1155:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1156:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1157:   FactorShiftCtx  sctx;
1158:   const PetscInt  *ddiag;
1159:   PetscReal       rs;
1160:   MatScalar       d;
1161:   PetscInt        inod,nodesz,node_max,col;
1162:   const PetscInt  *ns;
1163:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1166:   /* MatPivotSetUp(): initialize shift context sctx */
1167:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1169:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1170:     ddiag          = a->diag;
1171:     sctx.shift_top = info->zeropivot;
1172:     for (i=0; i<n; i++) {
1173:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1174:       d  = (aa)[ddiag[i]];
1175:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1176:       v  = aa+ai[i];
1177:       nz = ai[i+1] - ai[i];
1178:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1179:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1180:     }
1181:     sctx.shift_top *= 1.1;
1182:     sctx.nshift_max = 5;
1183:     sctx.shift_lo   = 0.;
1184:     sctx.shift_hi   = 1.;
1185:   }

1187:   ISGetIndices(isrow,&r);
1188:   ISGetIndices(isicol,&ic);

1190:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1191:   ics   = ic;

1193:   node_max = a->inode.node_count;
1194:   ns       = a->inode.size;
1195:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1197:   /* If max inode size > 4, split it into two inodes.*/
1198:   /* also map the inode sizes according to the ordering */
1199:   PetscMalloc1(n+1,&tmp_vec1);
1200:   for (i=0,j=0; i<node_max; ++i,++j) {
1201:     if (ns[i] > 4) {
1202:       tmp_vec1[j] = 4;
1203:       ++j;
1204:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1205:     } else {
1206:       tmp_vec1[j] = ns[i];
1207:     }
1208:   }
1209:   /* Use the correct node_max */
1210:   node_max = j;

1212:   /* Now reorder the inode info based on mat re-ordering info */
1213:   /* First create a row -> inode_size_array_index map */
1214:   PetscMalloc1(n+1,&nsmap);
1215:   PetscMalloc1(node_max+1,&tmp_vec2);
1216:   for (i=0,row=0; i<node_max; i++) {
1217:     nodesz = tmp_vec1[i];
1218:     for (j=0; j<nodesz; j++,row++) {
1219:       nsmap[row] = i;
1220:     }
1221:   }
1222:   /* Using nsmap, create a reordered ns structure */
1223:   for (i=0,j=0; i< node_max; i++) {
1224:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1225:     tmp_vec2[i] = nodesz;
1226:     j          += nodesz;
1227:   }
1228:   PetscFree(nsmap);
1229:   PetscFree(tmp_vec1);

1231:   /* Now use the correct ns */
1232:   ns = tmp_vec2;

1234:   do {
1235:     sctx.newshift = PETSC_FALSE;
1236:     /* Now loop over each block-row, and do the factorization */
1237:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1238:       nodesz = ns[inod];

1240:       switch (nodesz) {
1241:       case 1:
1242:         /*----------*/
1243:         /* zero rtmp1 */
1244:         /* L part */
1245:         nz    = bi[i+1] - bi[i];
1246:         bjtmp = bj + bi[i];
1247:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1249:         /* U part */
1250:         nz    = bdiag[i]-bdiag[i+1];
1251:         bjtmp = bj + bdiag[i+1]+1;
1252:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1254:         /* load in initial (unfactored row) */
1255:         nz    = ai[r[i]+1] - ai[r[i]];
1256:         ajtmp = aj + ai[r[i]];
1257:         v     = aa + ai[r[i]];
1258:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1260:         /* ZeropivotApply() */
1261:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1263:         /* elimination */
1264:         bjtmp = bj + bi[i];
1265:         row   = *bjtmp++;
1266:         nzL   = bi[i+1] - bi[i];
1267:         for (k=0; k < nzL; k++) {
1268:           pc = rtmp1 + row;
1269:           if (*pc != 0.0) {
1270:             pv   = b->a + bdiag[row];
1271:             mul1 = *pc * (*pv);
1272:             *pc  = mul1;
1273:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1274:             pv   = b->a + bdiag[row+1]+1;
1275:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1276:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1277:             PetscLogFlops(1+2*nz);
1278:           }
1279:           row = *bjtmp++;
1280:         }

1282:         /* finished row so stick it into b->a */
1283:         rs = 0.0;
1284:         /* L part */
1285:         pv = b->a + bi[i];
1286:         pj = b->j + bi[i];
1287:         nz = bi[i+1] - bi[i];
1288:         for (j=0; j<nz; j++) {
1289:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1290:         }

1292:         /* U part */
1293:         pv = b->a + bdiag[i+1]+1;
1294:         pj = b->j + bdiag[i+1]+1;
1295:         nz = bdiag[i] - bdiag[i+1]-1;
1296:         for (j=0; j<nz; j++) {
1297:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1298:         }

1300:         /* Check zero pivot */
1301:         sctx.rs = rs;
1302:         sctx.pv = rtmp1[i];
1303:         MatPivotCheck(B,A,info,&sctx,i);
1304:         if (sctx.newshift) break;

1306:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1307:         pv  = b->a + bdiag[i];
1308:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1309:         break;

1311:       case 2:
1312:         /*----------*/
1313:         /* zero rtmp1 and rtmp2 */
1314:         /* L part */
1315:         nz    = bi[i+1] - bi[i];
1316:         bjtmp = bj + bi[i];
1317:         for  (j=0; j<nz; j++) {
1318:           col        = bjtmp[j];
1319:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1320:         }

1322:         /* U part */
1323:         nz    = bdiag[i]-bdiag[i+1];
1324:         bjtmp = bj + bdiag[i+1]+1;
1325:         for  (j=0; j<nz; j++) {
1326:           col        = bjtmp[j];
1327:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1328:         }

1330:         /* load in initial (unfactored row) */
1331:         nz    = ai[r[i]+1] - ai[r[i]];
1332:         ajtmp = aj + ai[r[i]];
1333:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1334:         for (j=0; j<nz; j++) {
1335:           col        = ics[ajtmp[j]];
1336:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1337:         }
1338:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1339:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1341:         /* elimination */
1342:         bjtmp = bj + bi[i];
1343:         row   = *bjtmp++; /* pivot row */
1344:         nzL   = bi[i+1] - bi[i];
1345:         for (k=0; k < nzL; k++) {
1346:           pc1 = rtmp1 + row;
1347:           pc2 = rtmp2 + row;
1348:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1349:             pv   = b->a + bdiag[row];
1350:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1351:             *pc1 = mul1;       *pc2 = mul2;

1353:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1354:             pv = b->a + bdiag[row+1]+1;
1355:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1356:             for (j=0; j<nz; j++) {
1357:               col         = pj[j];
1358:               rtmp1[col] -= mul1 * pv[j];
1359:               rtmp2[col] -= mul2 * pv[j];
1360:             }
1361:             PetscLogFlops(2+4*nz);
1362:           }
1363:           row = *bjtmp++;
1364:         }

1366:         /* finished row i; check zero pivot, then stick row i into b->a */
1367:         rs = 0.0;
1368:         /* L part */
1369:         pc1 = b->a + bi[i];
1370:         pj  = b->j + bi[i];
1371:         nz  = bi[i+1] - bi[i];
1372:         for (j=0; j<nz; j++) {
1373:           col    = pj[j];
1374:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1375:         }
1376:         /* U part */
1377:         pc1 = b->a + bdiag[i+1]+1;
1378:         pj  = b->j + bdiag[i+1]+1;
1379:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1380:         for (j=0; j<nz; j++) {
1381:           col    = pj[j];
1382:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1383:         }

1385:         sctx.rs = rs;
1386:         sctx.pv = rtmp1[i];
1387:         MatPivotCheck(B,A,info,&sctx,i);
1388:         if (sctx.newshift) break;
1389:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1390:         *pc1 = 1.0/sctx.pv;

1392:         /* Now take care of diagonal 2x2 block. */
1393:         pc2 = rtmp2 + i;
1394:         if (*pc2 != 0.0) {
1395:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1396:           *pc2 = mul1;          /* insert L entry */
1397:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1398:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1399:           for (j=0; j<nz; j++) {
1400:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1401:           }
1402:           PetscLogFlops(1+2*nz);
1403:         }

1405:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1406:         rs = 0.0;
1407:         /* L part */
1408:         pc2 = b->a + bi[i+1];
1409:         pj  = b->j + bi[i+1];
1410:         nz  = bi[i+2] - bi[i+1];
1411:         for (j=0; j<nz; j++) {
1412:           col    = pj[j];
1413:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1414:         }
1415:         /* U part */
1416:         pc2 = b->a + bdiag[i+2]+1;
1417:         pj  = b->j + bdiag[i+2]+1;
1418:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1419:         for (j=0; j<nz; j++) {
1420:           col    = pj[j];
1421:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1422:         }

1424:         sctx.rs = rs;
1425:         sctx.pv = rtmp2[i+1];
1426:         MatPivotCheck(B,A,info,&sctx,i+1);
1427:         if (sctx.newshift) break;
1428:         pc2  = b->a + bdiag[i+1];
1429:         *pc2 = 1.0/sctx.pv;
1430:         break;

1432:       case 3:
1433:         /*----------*/
1434:         /* zero rtmp */
1435:         /* L part */
1436:         nz    = bi[i+1] - bi[i];
1437:         bjtmp = bj + bi[i];
1438:         for  (j=0; j<nz; j++) {
1439:           col        = bjtmp[j];
1440:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1441:         }

1443:         /* U part */
1444:         nz    = bdiag[i]-bdiag[i+1];
1445:         bjtmp = bj + bdiag[i+1]+1;
1446:         for  (j=0; j<nz; j++) {
1447:           col        = bjtmp[j];
1448:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1449:         }

1451:         /* load in initial (unfactored row) */
1452:         nz    = ai[r[i]+1] - ai[r[i]];
1453:         ajtmp = aj + ai[r[i]];
1454:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1455:         for (j=0; j<nz; j++) {
1456:           col        = ics[ajtmp[j]];
1457:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1458:         }
1459:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1460:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1462:         /* elimination */
1463:         bjtmp = bj + bi[i];
1464:         row   = *bjtmp++; /* pivot row */
1465:         nzL   = bi[i+1] - bi[i];
1466:         for (k=0; k < nzL; k++) {
1467:           pc1 = rtmp1 + row;
1468:           pc2 = rtmp2 + row;
1469:           pc3 = rtmp3 + row;
1470:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1471:             pv   = b->a + bdiag[row];
1472:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1473:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1475:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1476:             pv = b->a + bdiag[row+1]+1;
1477:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1478:             for (j=0; j<nz; j++) {
1479:               col         = pj[j];
1480:               rtmp1[col] -= mul1 * pv[j];
1481:               rtmp2[col] -= mul2 * pv[j];
1482:               rtmp3[col] -= mul3 * pv[j];
1483:             }
1484:             PetscLogFlops(3+6*nz);
1485:           }
1486:           row = *bjtmp++;
1487:         }

1489:         /* finished row i; check zero pivot, then stick row i into b->a */
1490:         rs = 0.0;
1491:         /* L part */
1492:         pc1 = b->a + bi[i];
1493:         pj  = b->j + bi[i];
1494:         nz  = bi[i+1] - bi[i];
1495:         for (j=0; j<nz; j++) {
1496:           col    = pj[j];
1497:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1498:         }
1499:         /* U part */
1500:         pc1 = b->a + bdiag[i+1]+1;
1501:         pj  = b->j + bdiag[i+1]+1;
1502:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1503:         for (j=0; j<nz; j++) {
1504:           col    = pj[j];
1505:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1506:         }

1508:         sctx.rs = rs;
1509:         sctx.pv = rtmp1[i];
1510:         MatPivotCheck(B,A,info,&sctx,i);
1511:         if (sctx.newshift) break;
1512:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1513:         *pc1 = 1.0/sctx.pv;

1515:         /* Now take care of 1st column of diagonal 3x3 block. */
1516:         pc2 = rtmp2 + i;
1517:         pc3 = rtmp3 + i;
1518:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1519:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1520:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1521:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1522:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1523:           for (j=0; j<nz; j++) {
1524:             col         = pj[j];
1525:             rtmp2[col] -= mul2 * rtmp1[col];
1526:             rtmp3[col] -= mul3 * rtmp1[col];
1527:           }
1528:           PetscLogFlops(2+4*nz);
1529:         }

1531:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1532:         rs = 0.0;
1533:         /* L part */
1534:         pc2 = b->a + bi[i+1];
1535:         pj  = b->j + bi[i+1];
1536:         nz  = bi[i+2] - bi[i+1];
1537:         for (j=0; j<nz; j++) {
1538:           col    = pj[j];
1539:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1540:         }
1541:         /* U part */
1542:         pc2 = b->a + bdiag[i+2]+1;
1543:         pj  = b->j + bdiag[i+2]+1;
1544:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1545:         for (j=0; j<nz; j++) {
1546:           col    = pj[j];
1547:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1548:         }

1550:         sctx.rs = rs;
1551:         sctx.pv = rtmp2[i+1];
1552:         MatPivotCheck(B,A,info,&sctx,i+1);
1553:         if (sctx.newshift) break;
1554:         pc2  = b->a + bdiag[i+1];
1555:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1557:         /* Now take care of 2nd column of diagonal 3x3 block. */
1558:         pc3 = rtmp3 + i+1;
1559:         if (*pc3 != 0.0) {
1560:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1561:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1562:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1563:           for (j=0; j<nz; j++) {
1564:             col         = pj[j];
1565:             rtmp3[col] -= mul3 * rtmp2[col];
1566:           }
1567:           PetscLogFlops(1+2*nz);
1568:         }

1570:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1571:         rs = 0.0;
1572:         /* L part */
1573:         pc3 = b->a + bi[i+2];
1574:         pj  = b->j + bi[i+2];
1575:         nz  = bi[i+3] - bi[i+2];
1576:         for (j=0; j<nz; j++) {
1577:           col    = pj[j];
1578:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1579:         }
1580:         /* U part */
1581:         pc3 = b->a + bdiag[i+3]+1;
1582:         pj  = b->j + bdiag[i+3]+1;
1583:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1584:         for (j=0; j<nz; j++) {
1585:           col    = pj[j];
1586:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1587:         }

1589:         sctx.rs = rs;
1590:         sctx.pv = rtmp3[i+2];
1591:         MatPivotCheck(B,A,info,&sctx,i+2);
1592:         if (sctx.newshift) break;
1593:         pc3  = b->a + bdiag[i+2];
1594:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1595:         break;
1596:       case 4:
1597:         /*----------*/
1598:         /* zero rtmp */
1599:         /* L part */
1600:         nz    = bi[i+1] - bi[i];
1601:         bjtmp = bj + bi[i];
1602:         for  (j=0; j<nz; j++) {
1603:           col        = bjtmp[j];
1604:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1605:         }

1607:         /* U part */
1608:         nz    = bdiag[i]-bdiag[i+1];
1609:         bjtmp = bj + bdiag[i+1]+1;
1610:         for  (j=0; j<nz; j++) {
1611:           col        = bjtmp[j];
1612:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1613:         }

1615:         /* load in initial (unfactored row) */
1616:         nz    = ai[r[i]+1] - ai[r[i]];
1617:         ajtmp = aj + ai[r[i]];
1618:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1619:         for (j=0; j<nz; j++) {
1620:           col        = ics[ajtmp[j]];
1621:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1622:         }
1623:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1624:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1626:         /* elimination */
1627:         bjtmp = bj + bi[i];
1628:         row   = *bjtmp++; /* pivot row */
1629:         nzL   = bi[i+1] - bi[i];
1630:         for (k=0; k < nzL; k++) {
1631:           pc1 = rtmp1 + row;
1632:           pc2 = rtmp2 + row;
1633:           pc3 = rtmp3 + row;
1634:           pc4 = rtmp4 + row;
1635:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1636:             pv   = b->a + bdiag[row];
1637:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1638:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1640:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1641:             pv = b->a + bdiag[row+1]+1;
1642:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1643:             for (j=0; j<nz; j++) {
1644:               col         = pj[j];
1645:               rtmp1[col] -= mul1 * pv[j];
1646:               rtmp2[col] -= mul2 * pv[j];
1647:               rtmp3[col] -= mul3 * pv[j];
1648:               rtmp4[col] -= mul4 * pv[j];
1649:             }
1650:             PetscLogFlops(4+8*nz);
1651:           }
1652:           row = *bjtmp++;
1653:         }

1655:         /* finished row i; check zero pivot, then stick row i into b->a */
1656:         rs = 0.0;
1657:         /* L part */
1658:         pc1 = b->a + bi[i];
1659:         pj  = b->j + bi[i];
1660:         nz  = bi[i+1] - bi[i];
1661:         for (j=0; j<nz; j++) {
1662:           col    = pj[j];
1663:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1664:         }
1665:         /* U part */
1666:         pc1 = b->a + bdiag[i+1]+1;
1667:         pj  = b->j + bdiag[i+1]+1;
1668:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1669:         for (j=0; j<nz; j++) {
1670:           col    = pj[j];
1671:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1672:         }

1674:         sctx.rs = rs;
1675:         sctx.pv = rtmp1[i];
1676:         MatPivotCheck(B,A,info,&sctx,i);
1677:         if (sctx.newshift) break;
1678:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1679:         *pc1 = 1.0/sctx.pv;

1681:         /* Now take care of 1st column of diagonal 4x4 block. */
1682:         pc2 = rtmp2 + i;
1683:         pc3 = rtmp3 + i;
1684:         pc4 = rtmp4 + i;
1685:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1686:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1687:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1688:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1689:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1690:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1691:           for (j=0; j<nz; j++) {
1692:             col         = pj[j];
1693:             rtmp2[col] -= mul2 * rtmp1[col];
1694:             rtmp3[col] -= mul3 * rtmp1[col];
1695:             rtmp4[col] -= mul4 * rtmp1[col];
1696:           }
1697:           PetscLogFlops(3+6*nz);
1698:         }

1700:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1701:         rs = 0.0;
1702:         /* L part */
1703:         pc2 = b->a + bi[i+1];
1704:         pj  = b->j + bi[i+1];
1705:         nz  = bi[i+2] - bi[i+1];
1706:         for (j=0; j<nz; j++) {
1707:           col    = pj[j];
1708:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1709:         }
1710:         /* U part */
1711:         pc2 = b->a + bdiag[i+2]+1;
1712:         pj  = b->j + bdiag[i+2]+1;
1713:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1714:         for (j=0; j<nz; j++) {
1715:           col    = pj[j];
1716:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1717:         }

1719:         sctx.rs = rs;
1720:         sctx.pv = rtmp2[i+1];
1721:         MatPivotCheck(B,A,info,&sctx,i+1);
1722:         if (sctx.newshift) break;
1723:         pc2  = b->a + bdiag[i+1];
1724:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1726:         /* Now take care of 2nd column of diagonal 4x4 block. */
1727:         pc3 = rtmp3 + i+1;
1728:         pc4 = rtmp4 + i+1;
1729:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1730:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1731:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1732:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1733:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1734:           for (j=0; j<nz; j++) {
1735:             col         = pj[j];
1736:             rtmp3[col] -= mul3 * rtmp2[col];
1737:             rtmp4[col] -= mul4 * rtmp2[col];
1738:           }
1739:           PetscLogFlops(4*nz);
1740:         }

1742:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1743:         rs = 0.0;
1744:         /* L part */
1745:         pc3 = b->a + bi[i+2];
1746:         pj  = b->j + bi[i+2];
1747:         nz  = bi[i+3] - bi[i+2];
1748:         for (j=0; j<nz; j++) {
1749:           col    = pj[j];
1750:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1751:         }
1752:         /* U part */
1753:         pc3 = b->a + bdiag[i+3]+1;
1754:         pj  = b->j + bdiag[i+3]+1;
1755:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1756:         for (j=0; j<nz; j++) {
1757:           col    = pj[j];
1758:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1759:         }

1761:         sctx.rs = rs;
1762:         sctx.pv = rtmp3[i+2];
1763:         MatPivotCheck(B,A,info,&sctx,i+2);
1764:         if (sctx.newshift) break;
1765:         pc3  = b->a + bdiag[i+2];
1766:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1768:         /* Now take care of 3rd column of diagonal 4x4 block. */
1769:         pc4 = rtmp4 + i+2;
1770:         if (*pc4 != 0.0) {
1771:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1772:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1773:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1774:           for (j=0; j<nz; j++) {
1775:             col         = pj[j];
1776:             rtmp4[col] -= mul4 * rtmp3[col];
1777:           }
1778:           PetscLogFlops(1+2*nz);
1779:         }

1781:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1782:         rs = 0.0;
1783:         /* L part */
1784:         pc4 = b->a + bi[i+3];
1785:         pj  = b->j + bi[i+3];
1786:         nz  = bi[i+4] - bi[i+3];
1787:         for (j=0; j<nz; j++) {
1788:           col    = pj[j];
1789:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1790:         }
1791:         /* U part */
1792:         pc4 = b->a + bdiag[i+4]+1;
1793:         pj  = b->j + bdiag[i+4]+1;
1794:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1795:         for (j=0; j<nz; j++) {
1796:           col    = pj[j];
1797:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1798:         }

1800:         sctx.rs = rs;
1801:         sctx.pv = rtmp4[i+3];
1802:         MatPivotCheck(B,A,info,&sctx,i+3);
1803:         if (sctx.newshift) break;
1804:         pc4  = b->a + bdiag[i+3];
1805:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1806:         break;

1808:       default:
1809:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1810:       }
1811:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1812:       i += nodesz;                 /* Update the row */
1813:     }

1815:     /* MatPivotRefine() */
1816:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1817:       /*
1818:        * if no shift in this attempt & shifting & started shifting & can refine,
1819:        * then try lower shift
1820:        */
1821:       sctx.shift_hi       = sctx.shift_fraction;
1822:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1823:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1824:       sctx.newshift       = PETSC_TRUE;
1825:       sctx.nshift++;
1826:     }
1827:   } while (sctx.newshift);

1829:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1830:   PetscFree(tmp_vec2);
1831:   ISRestoreIndices(isicol,&ic);
1832:   ISRestoreIndices(isrow,&r);

1834:   if (b->inode.size) {
1835:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1836:   } else {
1837:     C->ops->solve           = MatSolve_SeqAIJ;
1838:   }
1839:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1840:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1841:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1842:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1843:   C->assembled              = PETSC_TRUE;
1844:   C->preallocated           = PETSC_TRUE;

1846:   PetscLogFlops(C->cmap->n);

1848:   /* MatShiftView(A,info,&sctx) */
1849:   if (sctx.nshift) {
1850:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1851:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1852:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1853:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1854:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1855:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1856:     }
1857:   }
1858:   return(0);
1859: }

1861: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1862: {
1863:   Mat             C     = B;
1864:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1865:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1866:   PetscErrorCode  ierr;
1867:   const PetscInt  *r,*ic,*c,*ics;
1868:   PetscInt        n   = A->rmap->n,*bi = b->i;
1869:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1870:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1871:   PetscInt        *ai = a->i,*aj = a->j;
1872:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1873:   PetscScalar     mul1,mul2,mul3,tmp;
1874:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1875:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1876:   PetscReal       rs=0.0;
1877:   FactorShiftCtx  sctx;

1880:   sctx.shift_top      = 0;
1881:   sctx.nshift_max     = 0;
1882:   sctx.shift_lo       = 0;
1883:   sctx.shift_hi       = 0;
1884:   sctx.shift_fraction = 0;

1886:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1887:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1888:     sctx.shift_top = 0;
1889:     for (i=0; i<n; i++) {
1890:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1891:       rs    = 0.0;
1892:       ajtmp = aj + ai[i];
1893:       rtmp1 = aa + ai[i];
1894:       nz    = ai[i+1] - ai[i];
1895:       for (j=0; j<nz; j++) {
1896:         if (*ajtmp != i) {
1897:           rs += PetscAbsScalar(*rtmp1++);
1898:         } else {
1899:           rs -= PetscRealPart(*rtmp1++);
1900:         }
1901:         ajtmp++;
1902:       }
1903:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1904:     }
1905:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1906:     sctx.shift_top *= 1.1;
1907:     sctx.nshift_max = 5;
1908:     sctx.shift_lo   = 0.;
1909:     sctx.shift_hi   = 1.;
1910:   }
1911:   sctx.shift_amount = 0;
1912:   sctx.nshift       = 0;

1914:   ISGetIndices(isrow,&r);
1915:   ISGetIndices(iscol,&c);
1916:   ISGetIndices(isicol,&ic);
1917:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1918:   ics    = ic;

1920:   node_max = a->inode.node_count;
1921:   ns       = a->inode.size;
1922:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1924:   /* If max inode size > 3, split it into two inodes.*/
1925:   /* also map the inode sizes according to the ordering */
1926:   PetscMalloc1(n+1,&tmp_vec1);
1927:   for (i=0,j=0; i<node_max; ++i,++j) {
1928:     if (ns[i]>3) {
1929:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1930:       ++j;
1931:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1932:     } else {
1933:       tmp_vec1[j] = ns[i];
1934:     }
1935:   }
1936:   /* Use the correct node_max */
1937:   node_max = j;

1939:   /* Now reorder the inode info based on mat re-ordering info */
1940:   /* First create a row -> inode_size_array_index map */
1941:   PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1942:   for (i=0,row=0; i<node_max; i++) {
1943:     nodesz = tmp_vec1[i];
1944:     for (j=0; j<nodesz; j++,row++) {
1945:       nsmap[row] = i;
1946:     }
1947:   }
1948:   /* Using nsmap, create a reordered ns structure */
1949:   for (i=0,j=0; i< node_max; i++) {
1950:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1951:     tmp_vec2[i] = nodesz;
1952:     j          += nodesz;
1953:   }
1954:   PetscFree2(nsmap,tmp_vec1);
1955:   /* Now use the correct ns */
1956:   ns = tmp_vec2;

1958:   do {
1959:     sctx.newshift = PETSC_FALSE;
1960:     /* Now loop over each block-row, and do the factorization */
1961:     for (i=0,row=0; i<node_max; i++) {
1962:       nodesz = ns[i];
1963:       nz     = bi[row+1] - bi[row];
1964:       bjtmp  = bj + bi[row];

1966:       switch (nodesz) {
1967:       case 1:
1968:         for  (j=0; j<nz; j++) {
1969:           idx         = bjtmp[j];
1970:           rtmp11[idx] = 0.0;
1971:         }

1973:         /* load in initial (unfactored row) */
1974:         idx    = r[row];
1975:         nz_tmp = ai[idx+1] - ai[idx];
1976:         ajtmp  = aj + ai[idx];
1977:         v1     = aa + ai[idx];

1979:         for (j=0; j<nz_tmp; j++) {
1980:           idx         = ics[ajtmp[j]];
1981:           rtmp11[idx] = v1[j];
1982:         }
1983:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1985:         prow = *bjtmp++;
1986:         while (prow < row) {
1987:           pc1 = rtmp11 + prow;
1988:           if (*pc1 != 0.0) {
1989:             pv     = ba + bd[prow];
1990:             pj     = nbj + bd[prow];
1991:             mul1   = *pc1 * *pv++;
1992:             *pc1   = mul1;
1993:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1994:             PetscLogFlops(1+2*nz_tmp);
1995:             for (j=0; j<nz_tmp; j++) {
1996:               tmp          = pv[j];
1997:               idx          = pj[j];
1998:               rtmp11[idx] -= mul1 * tmp;
1999:             }
2000:           }
2001:           prow = *bjtmp++;
2002:         }
2003:         pj  = bj + bi[row];
2004:         pc1 = ba + bi[row];

2006:         sctx.pv     = rtmp11[row];
2007:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2008:         rs          = 0.0;
2009:         for (j=0; j<nz; j++) {
2010:           idx    = pj[j];
2011:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2012:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2013:         }
2014:         sctx.rs = rs;
2015:         MatPivotCheck(B,A,info,&sctx,row);
2016:         if (sctx.newshift) goto endofwhile;
2017:         break;

2019:       case 2:
2020:         for (j=0; j<nz; j++) {
2021:           idx         = bjtmp[j];
2022:           rtmp11[idx] = 0.0;
2023:           rtmp22[idx] = 0.0;
2024:         }

2026:         /* load in initial (unfactored row) */
2027:         idx    = r[row];
2028:         nz_tmp = ai[idx+1] - ai[idx];
2029:         ajtmp  = aj + ai[idx];
2030:         v1     = aa + ai[idx];
2031:         v2     = aa + ai[idx+1];
2032:         for (j=0; j<nz_tmp; j++) {
2033:           idx         = ics[ajtmp[j]];
2034:           rtmp11[idx] = v1[j];
2035:           rtmp22[idx] = v2[j];
2036:         }
2037:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2038:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2040:         prow = *bjtmp++;
2041:         while (prow < row) {
2042:           pc1 = rtmp11 + prow;
2043:           pc2 = rtmp22 + prow;
2044:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2045:             pv   = ba + bd[prow];
2046:             pj   = nbj + bd[prow];
2047:             mul1 = *pc1 * *pv;
2048:             mul2 = *pc2 * *pv;
2049:             ++pv;
2050:             *pc1 = mul1;
2051:             *pc2 = mul2;

2053:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2054:             for (j=0; j<nz_tmp; j++) {
2055:               tmp          = pv[j];
2056:               idx          = pj[j];
2057:               rtmp11[idx] -= mul1 * tmp;
2058:               rtmp22[idx] -= mul2 * tmp;
2059:             }
2060:             PetscLogFlops(2+4*nz_tmp);
2061:           }
2062:           prow = *bjtmp++;
2063:         }

2065:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2066:         pc1 = rtmp11 + prow;
2067:         pc2 = rtmp22 + prow;

2069:         sctx.pv = *pc1;
2070:         pj      = bj + bi[prow];
2071:         rs      = 0.0;
2072:         for (j=0; j<nz; j++) {
2073:           idx = pj[j];
2074:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2075:         }
2076:         sctx.rs = rs;
2077:         MatPivotCheck(B,A,info,&sctx,row);
2078:         if (sctx.newshift) goto endofwhile;

2080:         if (*pc2 != 0.0) {
2081:           pj     = nbj + bd[prow];
2082:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2083:           *pc2   = mul2;
2084:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2085:           for (j=0; j<nz_tmp; j++) {
2086:             idx          = pj[j];
2087:             tmp          = rtmp11[idx];
2088:             rtmp22[idx] -= mul2 * tmp;
2089:           }
2090:           PetscLogFlops(1+2*nz_tmp);
2091:         }

2093:         pj  = bj + bi[row];
2094:         pc1 = ba + bi[row];
2095:         pc2 = ba + bi[row+1];

2097:         sctx.pv       = rtmp22[row+1];
2098:         rs            = 0.0;
2099:         rtmp11[row]   = 1.0/rtmp11[row];
2100:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2101:         /* copy row entries from dense representation to sparse */
2102:         for (j=0; j<nz; j++) {
2103:           idx    = pj[j];
2104:           pc1[j] = rtmp11[idx];
2105:           pc2[j] = rtmp22[idx];
2106:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2107:         }
2108:         sctx.rs = rs;
2109:         MatPivotCheck(B,A,info,&sctx,row+1);
2110:         if (sctx.newshift) goto endofwhile;
2111:         break;

2113:       case 3:
2114:         for  (j=0; j<nz; j++) {
2115:           idx         = bjtmp[j];
2116:           rtmp11[idx] = 0.0;
2117:           rtmp22[idx] = 0.0;
2118:           rtmp33[idx] = 0.0;
2119:         }
2120:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2121:         idx    = r[row];
2122:         nz_tmp = ai[idx+1] - ai[idx];
2123:         ajtmp  = aj + ai[idx];
2124:         v1     = aa + ai[idx];
2125:         v2     = aa + ai[idx+1];
2126:         v3     = aa + ai[idx+2];
2127:         for (j=0; j<nz_tmp; j++) {
2128:           idx         = ics[ajtmp[j]];
2129:           rtmp11[idx] = v1[j];
2130:           rtmp22[idx] = v2[j];
2131:           rtmp33[idx] = v3[j];
2132:         }
2133:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2134:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2135:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2137:         /* loop over all pivot row blocks above this row block */
2138:         prow = *bjtmp++;
2139:         while (prow < row) {
2140:           pc1 = rtmp11 + prow;
2141:           pc2 = rtmp22 + prow;
2142:           pc3 = rtmp33 + prow;
2143:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2144:             pv   = ba  + bd[prow];
2145:             pj   = nbj + bd[prow];
2146:             mul1 = *pc1 * *pv;
2147:             mul2 = *pc2 * *pv;
2148:             mul3 = *pc3 * *pv;
2149:             ++pv;
2150:             *pc1 = mul1;
2151:             *pc2 = mul2;
2152:             *pc3 = mul3;

2154:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2155:             /* update this row based on pivot row */
2156:             for (j=0; j<nz_tmp; j++) {
2157:               tmp          = pv[j];
2158:               idx          = pj[j];
2159:               rtmp11[idx] -= mul1 * tmp;
2160:               rtmp22[idx] -= mul2 * tmp;
2161:               rtmp33[idx] -= mul3 * tmp;
2162:             }
2163:             PetscLogFlops(3+6*nz_tmp);
2164:           }
2165:           prow = *bjtmp++;
2166:         }

2168:         /* Now take care of diagonal 3x3 block in this set of rows */
2169:         /* note: prow = row here */
2170:         pc1 = rtmp11 + prow;
2171:         pc2 = rtmp22 + prow;
2172:         pc3 = rtmp33 + prow;

2174:         sctx.pv = *pc1;
2175:         pj      = bj + bi[prow];
2176:         rs      = 0.0;
2177:         for (j=0; j<nz; j++) {
2178:           idx = pj[j];
2179:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2180:         }
2181:         sctx.rs = rs;
2182:         MatPivotCheck(B,A,info,&sctx,row);
2183:         if (sctx.newshift) goto endofwhile;

2185:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2186:           mul2   = (*pc2)/(*pc1);
2187:           mul3   = (*pc3)/(*pc1);
2188:           *pc2   = mul2;
2189:           *pc3   = mul3;
2190:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2191:           pj     = nbj + bd[prow];
2192:           for (j=0; j<nz_tmp; j++) {
2193:             idx          = pj[j];
2194:             tmp          = rtmp11[idx];
2195:             rtmp22[idx] -= mul2 * tmp;
2196:             rtmp33[idx] -= mul3 * tmp;
2197:           }
2198:           PetscLogFlops(2+4*nz_tmp);
2199:         }
2200:         ++prow;

2202:         pc2     = rtmp22 + prow;
2203:         pc3     = rtmp33 + prow;
2204:         sctx.pv = *pc2;
2205:         pj      = bj + bi[prow];
2206:         rs      = 0.0;
2207:         for (j=0; j<nz; j++) {
2208:           idx = pj[j];
2209:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2210:         }
2211:         sctx.rs = rs;
2212:         MatPivotCheck(B,A,info,&sctx,row+1);
2213:         if (sctx.newshift) goto endofwhile;

2215:         if (*pc3 != 0.0) {
2216:           mul3   = (*pc3)/(*pc2);
2217:           *pc3   = mul3;
2218:           pj     = nbj + bd[prow];
2219:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2220:           for (j=0; j<nz_tmp; j++) {
2221:             idx          = pj[j];
2222:             tmp          = rtmp22[idx];
2223:             rtmp33[idx] -= mul3 * tmp;
2224:           }
2225:           PetscLogFlops(1+2*nz_tmp);
2226:         }

2228:         pj  = bj + bi[row];
2229:         pc1 = ba + bi[row];
2230:         pc2 = ba + bi[row+1];
2231:         pc3 = ba + bi[row+2];

2233:         sctx.pv       = rtmp33[row+2];
2234:         rs            = 0.0;
2235:         rtmp11[row]   = 1.0/rtmp11[row];
2236:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2237:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2238:         /* copy row entries from dense representation to sparse */
2239:         for (j=0; j<nz; j++) {
2240:           idx    = pj[j];
2241:           pc1[j] = rtmp11[idx];
2242:           pc2[j] = rtmp22[idx];
2243:           pc3[j] = rtmp33[idx];
2244:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2245:         }

2247:         sctx.rs = rs;
2248:         MatPivotCheck(B,A,info,&sctx,row+2);
2249:         if (sctx.newshift) goto endofwhile;
2250:         break;

2252:       default:
2253:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2254:       }
2255:       row += nodesz;                 /* Update the row */
2256:     }
2257: endofwhile:;
2258:   } while (sctx.newshift);
2259:   PetscFree3(rtmp11,rtmp22,rtmp33);
2260:   PetscFree(tmp_vec2);
2261:   ISRestoreIndices(isicol,&ic);
2262:   ISRestoreIndices(isrow,&r);
2263:   ISRestoreIndices(iscol,&c);

2265:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2266:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2267:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2268:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2269:   C->assembled              = PETSC_TRUE;
2270:   C->preallocated           = PETSC_TRUE;
2271:   if (sctx.nshift) {
2272:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2273:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2274:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2275:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2276:     }
2277:   }
2278:   PetscLogFlops(C->cmap->n);
2279:   MatSeqAIJCheckInode(C);
2280:   return(0);
2281: }


2284: /* ----------------------------------------------------------- */
2285: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2286: {
2287:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2288:   IS                iscol = a->col,isrow = a->row;
2289:   PetscErrorCode    ierr;
2290:   const PetscInt    *r,*c,*rout,*cout;
2291:   PetscInt          i,j,n = A->rmap->n;
2292:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2293:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2294:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2295:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2296:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2297:   const PetscScalar *b;

2300:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2301:   node_max = a->inode.node_count;
2302:   ns       = a->inode.size;     /* Node Size array */

2304:   VecGetArrayRead(bb,&b);
2305:   VecGetArrayWrite(xx,&x);
2306:   tmp  = a->solve_work;

2308:   ISGetIndices(isrow,&rout); r = rout;
2309:   ISGetIndices(iscol,&cout); c = cout;

2311:   /* forward solve the lower triangular */
2312:   tmps = tmp;
2313:   aa   = a_a;
2314:   aj   = a_j;
2315:   ad   = a->diag;

2317:   for (i = 0,row = 0; i< node_max; ++i) {
2318:     nsz = ns[i];
2319:     aii = ai[row];
2320:     v1  = aa + aii;
2321:     vi  = aj + aii;
2322:     nz  = ai[row+1]- ai[row];

2324:     if (i < node_max-1) {
2325:       /* Prefetch the indices for the next block */
2326:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2327:       /* Prefetch the data for the next block */
2328:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2329:     }

2331:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2332:     case 1:
2333:       sum1 = b[r[row]];
2334:       for (j=0; j<nz-1; j+=2) {
2335:         i0    = vi[j];
2336:         i1    = vi[j+1];
2337:         tmp0  = tmps[i0];
2338:         tmp1  = tmps[i1];
2339:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2340:       }
2341:       if (j == nz-1) {
2342:         tmp0  = tmps[vi[j]];
2343:         sum1 -= v1[j]*tmp0;
2344:       }
2345:       tmp[row++]=sum1;
2346:       break;
2347:     case 2:
2348:       sum1 = b[r[row]];
2349:       sum2 = b[r[row+1]];
2350:       v2   = aa + ai[row+1];

2352:       for (j=0; j<nz-1; j+=2) {
2353:         i0    = vi[j];
2354:         i1    = vi[j+1];
2355:         tmp0  = tmps[i0];
2356:         tmp1  = tmps[i1];
2357:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2358:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2359:       }
2360:       if (j == nz-1) {
2361:         tmp0  = tmps[vi[j]];
2362:         sum1 -= v1[j] *tmp0;
2363:         sum2 -= v2[j] *tmp0;
2364:       }
2365:       sum2     -= v2[nz] * sum1;
2366:       tmp[row++]=sum1;
2367:       tmp[row++]=sum2;
2368:       break;
2369:     case 3:
2370:       sum1 = b[r[row]];
2371:       sum2 = b[r[row+1]];
2372:       sum3 = b[r[row+2]];
2373:       v2   = aa + ai[row+1];
2374:       v3   = aa + ai[row+2];

2376:       for (j=0; j<nz-1; j+=2) {
2377:         i0    = vi[j];
2378:         i1    = vi[j+1];
2379:         tmp0  = tmps[i0];
2380:         tmp1  = tmps[i1];
2381:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2382:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2383:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2384:       }
2385:       if (j == nz-1) {
2386:         tmp0  = tmps[vi[j]];
2387:         sum1 -= v1[j] *tmp0;
2388:         sum2 -= v2[j] *tmp0;
2389:         sum3 -= v3[j] *tmp0;
2390:       }
2391:       sum2     -= v2[nz] * sum1;
2392:       sum3     -= v3[nz] * sum1;
2393:       sum3     -= v3[nz+1] * sum2;
2394:       tmp[row++]=sum1;
2395:       tmp[row++]=sum2;
2396:       tmp[row++]=sum3;
2397:       break;

2399:     case 4:
2400:       sum1 = b[r[row]];
2401:       sum2 = b[r[row+1]];
2402:       sum3 = b[r[row+2]];
2403:       sum4 = b[r[row+3]];
2404:       v2   = aa + ai[row+1];
2405:       v3   = aa + ai[row+2];
2406:       v4   = aa + ai[row+3];

2408:       for (j=0; j<nz-1; j+=2) {
2409:         i0    = vi[j];
2410:         i1    = vi[j+1];
2411:         tmp0  = tmps[i0];
2412:         tmp1  = tmps[i1];
2413:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2414:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2415:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2416:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2417:       }
2418:       if (j == nz-1) {
2419:         tmp0  = tmps[vi[j]];
2420:         sum1 -= v1[j] *tmp0;
2421:         sum2 -= v2[j] *tmp0;
2422:         sum3 -= v3[j] *tmp0;
2423:         sum4 -= v4[j] *tmp0;
2424:       }
2425:       sum2 -= v2[nz] * sum1;
2426:       sum3 -= v3[nz] * sum1;
2427:       sum4 -= v4[nz] * sum1;
2428:       sum3 -= v3[nz+1] * sum2;
2429:       sum4 -= v4[nz+1] * sum2;
2430:       sum4 -= v4[nz+2] * sum3;

2432:       tmp[row++]=sum1;
2433:       tmp[row++]=sum2;
2434:       tmp[row++]=sum3;
2435:       tmp[row++]=sum4;
2436:       break;
2437:     case 5:
2438:       sum1 = b[r[row]];
2439:       sum2 = b[r[row+1]];
2440:       sum3 = b[r[row+2]];
2441:       sum4 = b[r[row+3]];
2442:       sum5 = b[r[row+4]];
2443:       v2   = aa + ai[row+1];
2444:       v3   = aa + ai[row+2];
2445:       v4   = aa + ai[row+3];
2446:       v5   = aa + ai[row+4];

2448:       for (j=0; j<nz-1; j+=2) {
2449:         i0    = vi[j];
2450:         i1    = vi[j+1];
2451:         tmp0  = tmps[i0];
2452:         tmp1  = tmps[i1];
2453:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2454:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2455:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2456:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2457:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2458:       }
2459:       if (j == nz-1) {
2460:         tmp0  = tmps[vi[j]];
2461:         sum1 -= v1[j] *tmp0;
2462:         sum2 -= v2[j] *tmp0;
2463:         sum3 -= v3[j] *tmp0;
2464:         sum4 -= v4[j] *tmp0;
2465:         sum5 -= v5[j] *tmp0;
2466:       }

2468:       sum2 -= v2[nz] * sum1;
2469:       sum3 -= v3[nz] * sum1;
2470:       sum4 -= v4[nz] * sum1;
2471:       sum5 -= v5[nz] * sum1;
2472:       sum3 -= v3[nz+1] * sum2;
2473:       sum4 -= v4[nz+1] * sum2;
2474:       sum5 -= v5[nz+1] * sum2;
2475:       sum4 -= v4[nz+2] * sum3;
2476:       sum5 -= v5[nz+2] * sum3;
2477:       sum5 -= v5[nz+3] * sum4;

2479:       tmp[row++]=sum1;
2480:       tmp[row++]=sum2;
2481:       tmp[row++]=sum3;
2482:       tmp[row++]=sum4;
2483:       tmp[row++]=sum5;
2484:       break;
2485:     default:
2486:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2487:     }
2488:   }
2489:   /* backward solve the upper triangular */
2490:   for (i=node_max -1,row = n-1; i>=0; i--) {
2491:     nsz = ns[i];
2492:     aii = ad[row+1] + 1;
2493:     v1  = aa + aii;
2494:     vi  = aj + aii;
2495:     nz  = ad[row]- ad[row+1] - 1;

2497:     if (i > 0) {
2498:       /* Prefetch the indices for the next block */
2499:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2500:       /* Prefetch the data for the next block */
2501:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502:     }

2504:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2505:     case 1:
2506:       sum1 = tmp[row];

2508:       for (j=0; j<nz-1; j+=2) {
2509:         i0    = vi[j];
2510:         i1    = vi[j+1];
2511:         tmp0  = tmps[i0];
2512:         tmp1  = tmps[i1];
2513:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2514:       }
2515:       if (j == nz-1) {
2516:         tmp0  = tmps[vi[j]];
2517:         sum1 -= v1[j]*tmp0;
2518:       }
2519:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2520:       break;
2521:     case 2:
2522:       sum1 = tmp[row];
2523:       sum2 = tmp[row-1];
2524:       v2   = aa + ad[row] + 1;
2525:       for (j=0; j<nz-1; j+=2) {
2526:         i0    = vi[j];
2527:         i1    = vi[j+1];
2528:         tmp0  = tmps[i0];
2529:         tmp1  = tmps[i1];
2530:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2531:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2532:       }
2533:       if (j == nz-1) {
2534:         tmp0  = tmps[vi[j]];
2535:         sum1 -= v1[j]* tmp0;
2536:         sum2 -= v2[j+1]* tmp0;
2537:       }

2539:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2540:       sum2     -= v2[0] * tmp0;
2541:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2542:       break;
2543:     case 3:
2544:       sum1 = tmp[row];
2545:       sum2 = tmp[row -1];
2546:       sum3 = tmp[row -2];
2547:       v2   = aa + ad[row] + 1;
2548:       v3   = aa + ad[row -1] + 1;
2549:       for (j=0; j<nz-1; j+=2) {
2550:         i0    = vi[j];
2551:         i1    = vi[j+1];
2552:         tmp0  = tmps[i0];
2553:         tmp1  = tmps[i1];
2554:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2555:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2556:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2557:       }
2558:       if (j== nz-1) {
2559:         tmp0  = tmps[vi[j]];
2560:         sum1 -= v1[j] * tmp0;
2561:         sum2 -= v2[j+1] * tmp0;
2562:         sum3 -= v3[j+2] * tmp0;
2563:       }
2564:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565:       sum2     -= v2[0]* tmp0;
2566:       sum3     -= v3[1] * tmp0;
2567:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2568:       sum3     -= v3[0]* tmp0;
2569:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2571:       break;
2572:     case 4:
2573:       sum1 = tmp[row];
2574:       sum2 = tmp[row -1];
2575:       sum3 = tmp[row -2];
2576:       sum4 = tmp[row -3];
2577:       v2   = aa + ad[row]+1;
2578:       v3   = aa + ad[row -1]+1;
2579:       v4   = aa + ad[row -2]+1;

2581:       for (j=0; j<nz-1; j+=2) {
2582:         i0    = vi[j];
2583:         i1    = vi[j+1];
2584:         tmp0  = tmps[i0];
2585:         tmp1  = tmps[i1];
2586:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2587:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2588:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2589:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2590:       }
2591:       if (j== nz-1) {
2592:         tmp0  = tmps[vi[j]];
2593:         sum1 -= v1[j] * tmp0;
2594:         sum2 -= v2[j+1] * tmp0;
2595:         sum3 -= v3[j+2] * tmp0;
2596:         sum4 -= v4[j+3] * tmp0;
2597:       }

2599:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600:       sum2     -= v2[0] * tmp0;
2601:       sum3     -= v3[1] * tmp0;
2602:       sum4     -= v4[2] * tmp0;
2603:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2604:       sum3     -= v3[0] * tmp0;
2605:       sum4     -= v4[1] * tmp0;
2606:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2607:       sum4     -= v4[0] * tmp0;
2608:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2609:       break;
2610:     case 5:
2611:       sum1 = tmp[row];
2612:       sum2 = tmp[row -1];
2613:       sum3 = tmp[row -2];
2614:       sum4 = tmp[row -3];
2615:       sum5 = tmp[row -4];
2616:       v2   = aa + ad[row]+1;
2617:       v3   = aa + ad[row -1]+1;
2618:       v4   = aa + ad[row -2]+1;
2619:       v5   = aa + ad[row -3]+1;
2620:       for (j=0; j<nz-1; j+=2) {
2621:         i0    = vi[j];
2622:         i1    = vi[j+1];
2623:         tmp0  = tmps[i0];
2624:         tmp1  = tmps[i1];
2625:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2626:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2630:       }
2631:       if (j==nz-1) {
2632:         tmp0  = tmps[vi[j]];
2633:         sum1 -= v1[j] * tmp0;
2634:         sum2 -= v2[j+1] * tmp0;
2635:         sum3 -= v3[j+2] * tmp0;
2636:         sum4 -= v4[j+3] * tmp0;
2637:         sum5 -= v5[j+4] * tmp0;
2638:       }

2640:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2641:       sum2     -= v2[0] * tmp0;
2642:       sum3     -= v3[1] * tmp0;
2643:       sum4     -= v4[2] * tmp0;
2644:       sum5     -= v5[3] * tmp0;
2645:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2646:       sum3     -= v3[0] * tmp0;
2647:       sum4     -= v4[1] * tmp0;
2648:       sum5     -= v5[2] * tmp0;
2649:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650:       sum4     -= v4[0] * tmp0;
2651:       sum5     -= v5[1] * tmp0;
2652:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2653:       sum5     -= v5[0] * tmp0;
2654:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2655:       break;
2656:     default:
2657:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2658:     }
2659:   }
2660:   ISRestoreIndices(isrow,&rout);
2661:   ISRestoreIndices(iscol,&cout);
2662:   VecRestoreArrayRead(bb,&b);
2663:   VecRestoreArrayWrite(xx,&x);
2664:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2665:   return(0);
2666: }


2669: /*
2670:      Makes a longer coloring[] array and calls the usual code with that
2671: */
2672: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2673: {
2674:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2675:   PetscErrorCode  ierr;
2676:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2677:   PetscInt        *colorused,i;
2678:   ISColoringValue *newcolor;

2681:   PetscMalloc1(n+1,&newcolor);
2682:   /* loop over inodes, marking a color for each column*/
2683:   row = 0;
2684:   for (i=0; i<m; i++) {
2685:     for (j=0; j<ns[i]; j++) {
2686:       newcolor[row++] = coloring[i] + j*ncolors;
2687:     }
2688:   }

2690:   /* eliminate unneeded colors */
2691:   PetscCalloc1(5*ncolors,&colorused);
2692:   for (i=0; i<n; i++) {
2693:     colorused[newcolor[i]] = 1;
2694:   }

2696:   for (i=1; i<5*ncolors; i++) {
2697:     colorused[i] += colorused[i-1];
2698:   }
2699:   ncolors = colorused[5*ncolors-1];
2700:   for (i=0; i<n; i++) {
2701:     newcolor[i] = colorused[newcolor[i]]-1;
2702:   }
2703:   PetscFree(colorused);
2704:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2705:   PetscFree(coloring);
2706:   return(0);
2707: }

2709:  #include <petsc/private/kernels/blockinvert.h>

2711: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2712: {
2713:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2714:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2715:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2716:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2717:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2718:   const PetscScalar *xb, *b;
2719:   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2720:   PetscErrorCode    ierr;
2721:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2722:   PetscInt          sz,k,ipvt[5];
2723:   PetscBool         allowzeropivot,zeropivotdetected;
2724:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2727:   allowzeropivot = PetscNot(A->erroriffailure);
2728:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2729:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2731:   if (!a->inode.ibdiagvalid) {
2732:     if (!a->inode.ibdiag) {
2733:       /* calculate space needed for diagonal blocks */
2734:       for (i=0; i<m; i++) {
2735:         cnt += sizes[i]*sizes[i];
2736:       }
2737:       a->inode.bdiagsize = cnt;

2739:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2740:     }

2742:     /* copy over the diagonal blocks and invert them */
2743:     ibdiag = a->inode.ibdiag;
2744:     bdiag  = a->inode.bdiag;
2745:     cnt    = 0;
2746:     for (i=0, row = 0; i<m; i++) {
2747:       for (j=0; j<sizes[i]; j++) {
2748:         for (k=0; k<sizes[i]; k++) {
2749:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2750:         }
2751:       }
2752:       PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);

2754:       switch (sizes[i]) {
2755:       case 1:
2756:         /* Create matrix data structure */
2757:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2758:           if (allowzeropivot) {
2759:             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2760:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2761:             A->factorerror_zeropivot_row   = row;
2762:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2763:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2764:         }
2765:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2766:         break;
2767:       case 2:
2768:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2769:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2770:         break;
2771:       case 3:
2772:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2773:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2774:         break;
2775:       case 4:
2776:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2777:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2778:         break;
2779:       case 5:
2780:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2781:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782:         break;
2783:       default:
2784:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2785:       }
2786:       cnt += sizes[i]*sizes[i];
2787:       row += sizes[i];
2788:     }
2789:     a->inode.ibdiagvalid = PETSC_TRUE;
2790:   }
2791:   ibdiag = a->inode.ibdiag;
2792:   bdiag  = a->inode.bdiag;
2793:   t      = a->inode.ssor_work;

2795:   VecGetArray(xx,&x);
2796:   VecGetArrayRead(bb,&b);
2797:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2798:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2799:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2801:       for (i=0, row=0; i<m; i++) {
2802:         sz  = diag[row] - ii[row];
2803:         v1  = a->a + ii[row];
2804:         idx = a->j + ii[row];

2806:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2807:         switch (sizes[i]) {
2808:         case 1:

2810:           sum1 = b[row];
2811:           for (n = 0; n<sz-1; n+=2) {
2812:             i1    = idx[0];
2813:             i2    = idx[1];
2814:             idx  += 2;
2815:             tmp0  = x[i1];
2816:             tmp1  = x[i2];
2817:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2818:           }

2820:           if (n == sz-1) {
2821:             tmp0  = x[*idx];
2822:             sum1 -= *v1 * tmp0;
2823:           }
2824:           t[row]   = sum1;
2825:           x[row++] = sum1*(*ibdiag++);
2826:           break;
2827:         case 2:
2828:           v2   = a->a + ii[row+1];
2829:           sum1 = b[row];
2830:           sum2 = b[row+1];
2831:           for (n = 0; n<sz-1; n+=2) {
2832:             i1    = idx[0];
2833:             i2    = idx[1];
2834:             idx  += 2;
2835:             tmp0  = x[i1];
2836:             tmp1  = x[i2];
2837:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2838:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2839:           }

2841:           if (n == sz-1) {
2842:             tmp0  = x[*idx];
2843:             sum1 -= v1[0] * tmp0;
2844:             sum2 -= v2[0] * tmp0;
2845:           }
2846:           t[row]   = sum1;
2847:           t[row+1] = sum2;
2848:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2849:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2850:           ibdiag  += 4;
2851:           break;
2852:         case 3:
2853:           v2   = a->a + ii[row+1];
2854:           v3   = a->a + ii[row+2];
2855:           sum1 = b[row];
2856:           sum2 = b[row+1];
2857:           sum3 = b[row+2];
2858:           for (n = 0; n<sz-1; n+=2) {
2859:             i1    = idx[0];
2860:             i2    = idx[1];
2861:             idx  += 2;
2862:             tmp0  = x[i1];
2863:             tmp1  = x[i2];
2864:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2866:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2867:           }

2869:           if (n == sz-1) {
2870:             tmp0  = x[*idx];
2871:             sum1 -= v1[0] * tmp0;
2872:             sum2 -= v2[0] * tmp0;
2873:             sum3 -= v3[0] * tmp0;
2874:           }
2875:           t[row]   = sum1;
2876:           t[row+1] = sum2;
2877:           t[row+2] = sum3;
2878:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2879:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2880:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2881:           ibdiag  += 9;
2882:           break;
2883:         case 4:
2884:           v2   = a->a + ii[row+1];
2885:           v3   = a->a + ii[row+2];
2886:           v4   = a->a + ii[row+3];
2887:           sum1 = b[row];
2888:           sum2 = b[row+1];
2889:           sum3 = b[row+2];
2890:           sum4 = b[row+3];
2891:           for (n = 0; n<sz-1; n+=2) {
2892:             i1    = idx[0];
2893:             i2    = idx[1];
2894:             idx  += 2;
2895:             tmp0  = x[i1];
2896:             tmp1  = x[i2];
2897:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2898:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2899:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2900:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2901:           }

2903:           if (n == sz-1) {
2904:             tmp0  = x[*idx];
2905:             sum1 -= v1[0] * tmp0;
2906:             sum2 -= v2[0] * tmp0;
2907:             sum3 -= v3[0] * tmp0;
2908:             sum4 -= v4[0] * tmp0;
2909:           }
2910:           t[row]   = sum1;
2911:           t[row+1] = sum2;
2912:           t[row+2] = sum3;
2913:           t[row+3] = sum4;
2914:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2915:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2916:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2917:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2918:           ibdiag  += 16;
2919:           break;
2920:         case 5:
2921:           v2   = a->a + ii[row+1];
2922:           v3   = a->a + ii[row+2];
2923:           v4   = a->a + ii[row+3];
2924:           v5   = a->a + ii[row+4];
2925:           sum1 = b[row];
2926:           sum2 = b[row+1];
2927:           sum3 = b[row+2];
2928:           sum4 = b[row+3];
2929:           sum5 = b[row+4];
2930:           for (n = 0; n<sz-1; n+=2) {
2931:             i1    = idx[0];
2932:             i2    = idx[1];
2933:             idx  += 2;
2934:             tmp0  = x[i1];
2935:             tmp1  = x[i2];
2936:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2937:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2938:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2939:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2940:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2941:           }

2943:           if (n == sz-1) {
2944:             tmp0  = x[*idx];
2945:             sum1 -= v1[0] * tmp0;
2946:             sum2 -= v2[0] * tmp0;
2947:             sum3 -= v3[0] * tmp0;
2948:             sum4 -= v4[0] * tmp0;
2949:             sum5 -= v5[0] * tmp0;
2950:           }
2951:           t[row]   = sum1;
2952:           t[row+1] = sum2;
2953:           t[row+2] = sum3;
2954:           t[row+3] = sum4;
2955:           t[row+4] = sum5;
2956:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2957:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2958:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2959:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2960:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2961:           ibdiag  += 25;
2962:           break;
2963:         default:
2964:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2965:         }
2966:       }

2968:       xb   = t;
2969:       PetscLogFlops(a->nz);
2970:     } else xb = b;
2971:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

2973:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2974:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2975:         ibdiag -= sizes[i]*sizes[i];
2976:         sz      = ii[row+1] - diag[row] - 1;
2977:         v1      = a->a + diag[row] + 1;
2978:         idx     = a->j + diag[row] + 1;

2980:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2981:         switch (sizes[i]) {
2982:         case 1:

2984:           sum1 = xb[row];
2985:           for (n = 0; n<sz-1; n+=2) {
2986:             i1    = idx[0];
2987:             i2    = idx[1];
2988:             idx  += 2;
2989:             tmp0  = x[i1];
2990:             tmp1  = x[i2];
2991:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2992:           }

2994:           if (n == sz-1) {
2995:             tmp0  = x[*idx];
2996:             sum1 -= *v1*tmp0;
2997:           }
2998:           x[row--] = sum1*(*ibdiag);
2999:           break;

3001:         case 2:

3003:           sum1 = xb[row];
3004:           sum2 = xb[row-1];
3005:           /* note that sum1 is associated with the second of the two rows */
3006:           v2 = a->a + diag[row-1] + 2;
3007:           for (n = 0; n<sz-1; n+=2) {
3008:             i1    = idx[0];
3009:             i2    = idx[1];
3010:             idx  += 2;
3011:             tmp0  = x[i1];
3012:             tmp1  = x[i2];
3013:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3014:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3015:           }

3017:           if (n == sz-1) {
3018:             tmp0  = x[*idx];
3019:             sum1 -= *v1*tmp0;
3020:             sum2 -= *v2*tmp0;
3021:           }
3022:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3023:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3024:           break;
3025:         case 3:

3027:           sum1 = xb[row];
3028:           sum2 = xb[row-1];
3029:           sum3 = xb[row-2];
3030:           v2   = a->a + diag[row-1] + 2;
3031:           v3   = a->a + diag[row-2] + 3;
3032:           for (n = 0; n<sz-1; n+=2) {
3033:             i1    = idx[0];
3034:             i2    = idx[1];
3035:             idx  += 2;
3036:             tmp0  = x[i1];
3037:             tmp1  = x[i2];
3038:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3039:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3040:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3041:           }

3043:           if (n == sz-1) {
3044:             tmp0  = x[*idx];
3045:             sum1 -= *v1*tmp0;
3046:             sum2 -= *v2*tmp0;
3047:             sum3 -= *v3*tmp0;
3048:           }
3049:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3050:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3051:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3052:           break;
3053:         case 4:

3055:           sum1 = xb[row];
3056:           sum2 = xb[row-1];
3057:           sum3 = xb[row-2];
3058:           sum4 = xb[row-3];
3059:           v2   = a->a + diag[row-1] + 2;
3060:           v3   = a->a + diag[row-2] + 3;
3061:           v4   = a->a + diag[row-3] + 4;
3062:           for (n = 0; n<sz-1; n+=2) {
3063:             i1    = idx[0];
3064:             i2    = idx[1];
3065:             idx  += 2;
3066:             tmp0  = x[i1];
3067:             tmp1  = x[i2];
3068:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3072:           }

3074:           if (n == sz-1) {
3075:             tmp0  = x[*idx];
3076:             sum1 -= *v1*tmp0;
3077:             sum2 -= *v2*tmp0;
3078:             sum3 -= *v3*tmp0;
3079:             sum4 -= *v4*tmp0;
3080:           }
3081:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3082:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3083:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3084:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3085:           break;
3086:         case 5:

3088:           sum1 = xb[row];
3089:           sum2 = xb[row-1];
3090:           sum3 = xb[row-2];
3091:           sum4 = xb[row-3];
3092:           sum5 = xb[row-4];
3093:           v2   = a->a + diag[row-1] + 2;
3094:           v3   = a->a + diag[row-2] + 3;
3095:           v4   = a->a + diag[row-3] + 4;
3096:           v5   = a->a + diag[row-4] + 5;
3097:           for (n = 0; n<sz-1; n+=2) {
3098:             i1    = idx[0];
3099:             i2    = idx[1];
3100:             idx  += 2;
3101:             tmp0  = x[i1];
3102:             tmp1  = x[i2];
3103:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3108:           }

3110:           if (n == sz-1) {
3111:             tmp0  = x[*idx];
3112:             sum1 -= *v1*tmp0;
3113:             sum2 -= *v2*tmp0;
3114:             sum3 -= *v3*tmp0;
3115:             sum4 -= *v4*tmp0;
3116:             sum5 -= *v5*tmp0;
3117:           }
3118:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3119:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3120:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3121:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3122:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3123:           break;
3124:         default:
3125:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3126:         }
3127:       }

3129:       PetscLogFlops(a->nz);
3130:     }
3131:     its--;
3132:   }
3133:   while (its--) {

3135:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3136:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3137:            i<m;
3138:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3140:         sz  = diag[row] - ii[row];
3141:         v1  = a->a + ii[row];
3142:         idx = a->j + ii[row];
3143:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3144:         switch (sizes[i]) {
3145:         case 1:
3146:           sum1 = b[row];
3147:           for (n = 0; n<sz-1; n+=2) {
3148:             i1    = idx[0];
3149:             i2    = idx[1];
3150:             idx  += 2;
3151:             tmp0  = x[i1];
3152:             tmp1  = x[i2];
3153:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154:           }
3155:           if (n == sz-1) {
3156:             tmp0  = x[*idx++];
3157:             sum1 -= *v1 * tmp0;
3158:             v1++;
3159:           }
3160:           t[row]   = sum1;
3161:           sz      = ii[row+1] - diag[row] - 1;
3162:           idx     = a->j + diag[row] + 1;
3163:           v1 += 1;
3164:           for (n = 0; n<sz-1; n+=2) {
3165:             i1    = idx[0];
3166:             i2    = idx[1];
3167:             idx  += 2;
3168:             tmp0  = x[i1];
3169:             tmp1  = x[i2];
3170:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3171:           }
3172:           if (n == sz-1) {
3173:             tmp0  = x[*idx++];
3174:             sum1 -= *v1 * tmp0;
3175:           }
3176:           /* in MatSOR_SeqAIJ this line would be
3177:            *
3178:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3179:            *
3180:            * but omega == 1, so this becomes
3181:            *
3182:            * x[row] = sum1*(*ibdiag++);
3183:            *
3184:            */
3185:           x[row] = sum1*(*ibdiag);
3186:           break;
3187:         case 2:
3188:           v2   = a->a + ii[row+1];
3189:           sum1 = b[row];
3190:           sum2 = b[row+1];
3191:           for (n = 0; n<sz-1; n+=2) {
3192:             i1    = idx[0];
3193:             i2    = idx[1];
3194:             idx  += 2;
3195:             tmp0  = x[i1];
3196:             tmp1  = x[i2];
3197:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3199:           }
3200:           if (n == sz-1) {
3201:             tmp0  = x[*idx++];
3202:             sum1 -= v1[0] * tmp0;
3203:             sum2 -= v2[0] * tmp0;
3204:             v1++; v2++;
3205:           }
3206:           t[row]   = sum1;
3207:           t[row+1] = sum2;
3208:           sz      = ii[row+1] - diag[row] - 2;
3209:           idx     = a->j + diag[row] + 2;
3210:           v1 += 2;
3211:           v2 += 2;
3212:           for (n = 0; n<sz-1; n+=2) {
3213:             i1    = idx[0];
3214:             i2    = idx[1];
3215:             idx  += 2;
3216:             tmp0  = x[i1];
3217:             tmp1  = x[i2];
3218:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3219:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3220:           }
3221:           if (n == sz-1) {
3222:             tmp0  = x[*idx];
3223:             sum1 -= v1[0] * tmp0;
3224:             sum2 -= v2[0] * tmp0;
3225:           }
3226:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3227:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3228:           break;
3229:         case 3:
3230:           v2   = a->a + ii[row+1];
3231:           v3   = a->a + ii[row+2];
3232:           sum1 = b[row];
3233:           sum2 = b[row+1];
3234:           sum3 = b[row+2];
3235:           for (n = 0; n<sz-1; n+=2) {
3236:             i1    = idx[0];
3237:             i2    = idx[1];
3238:             idx  += 2;
3239:             tmp0  = x[i1];
3240:             tmp1  = x[i2];
3241:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244:           }
3245:           if (n == sz-1) {
3246:             tmp0  = x[*idx++];
3247:             sum1 -= v1[0] * tmp0;
3248:             sum2 -= v2[0] * tmp0;
3249:             sum3 -= v3[0] * tmp0;
3250:             v1++; v2++; v3++;
3251:           }
3252:           t[row]   = sum1;
3253:           t[row+1] = sum2;
3254:           t[row+2] = sum3;
3255:           sz      = ii[row+1] - diag[row] - 3;
3256:           idx     = a->j + diag[row] + 3;
3257:           v1 += 3;
3258:           v2 += 3;
3259:           v3 += 3;
3260:           for (n = 0; n<sz-1; n+=2) {
3261:             i1    = idx[0];
3262:             i2    = idx[1];
3263:             idx  += 2;
3264:             tmp0  = x[i1];
3265:             tmp1  = x[i2];
3266:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3267:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3268:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3269:           }
3270:           if (n == sz-1) {
3271:             tmp0  = x[*idx];
3272:             sum1 -= v1[0] * tmp0;
3273:             sum2 -= v2[0] * tmp0;
3274:             sum3 -= v3[0] * tmp0;
3275:           }
3276:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3277:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3278:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3279:           break;
3280:         case 4:
3281:           v2   = a->a + ii[row+1];
3282:           v3   = a->a + ii[row+2];
3283:           v4   = a->a + ii[row+3];
3284:           sum1 = b[row];
3285:           sum2 = b[row+1];
3286:           sum3 = b[row+2];
3287:           sum4 = b[row+3];
3288:           for (n = 0; n<sz-1; n+=2) {
3289:             i1    = idx[0];
3290:             i2    = idx[1];
3291:             idx  += 2;
3292:             tmp0  = x[i1];
3293:             tmp1  = x[i2];
3294:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3295:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3296:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3297:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3298:           }
3299:           if (n == sz-1) {
3300:             tmp0  = x[*idx++];
3301:             sum1 -= v1[0] * tmp0;
3302:             sum2 -= v2[0] * tmp0;
3303:             sum3 -= v3[0] * tmp0;
3304:             sum4 -= v4[0] * tmp0;
3305:             v1++; v2++; v3++; v4++;
3306:           }
3307:           t[row]   = sum1;
3308:           t[row+1] = sum2;
3309:           t[row+2] = sum3;
3310:           t[row+3] = sum4;
3311:           sz      = ii[row+1] - diag[row] - 4;
3312:           idx     = a->j + diag[row] + 4;
3313:           v1 += 4;
3314:           v2 += 4;
3315:           v3 += 4;
3316:           v4 += 4;
3317:           for (n = 0; n<sz-1; n+=2) {
3318:             i1    = idx[0];
3319:             i2    = idx[1];
3320:             idx  += 2;
3321:             tmp0  = x[i1];
3322:             tmp1  = x[i2];
3323:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3324:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3325:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3326:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3327:           }
3328:           if (n == sz-1) {
3329:             tmp0  = x[*idx];
3330:             sum1 -= v1[0] * tmp0;
3331:             sum2 -= v2[0] * tmp0;
3332:             sum3 -= v3[0] * tmp0;
3333:             sum4 -= v4[0] * tmp0;
3334:           }
3335:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3336:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3337:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3338:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3339:           break;
3340:         case 5:
3341:           v2   = a->a + ii[row+1];
3342:           v3   = a->a + ii[row+2];
3343:           v4   = a->a + ii[row+3];
3344:           v5   = a->a + ii[row+4];
3345:           sum1 = b[row];
3346:           sum2 = b[row+1];
3347:           sum3 = b[row+2];
3348:           sum4 = b[row+3];
3349:           sum5 = b[row+4];
3350:           for (n = 0; n<sz-1; n+=2) {
3351:             i1    = idx[0];
3352:             i2    = idx[1];
3353:             idx  += 2;
3354:             tmp0  = x[i1];
3355:             tmp1  = x[i2];
3356:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3358:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3359:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3360:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3361:           }
3362:           if (n == sz-1) {
3363:             tmp0  = x[*idx++];
3364:             sum1 -= v1[0] * tmp0;
3365:             sum2 -= v2[0] * tmp0;
3366:             sum3 -= v3[0] * tmp0;
3367:             sum4 -= v4[0] * tmp0;
3368:             sum5 -= v5[0] * tmp0;
3369:             v1++; v2++; v3++; v4++; v5++;
3370:           }
3371:           t[row]   = sum1;
3372:           t[row+1] = sum2;
3373:           t[row+2] = sum3;
3374:           t[row+3] = sum4;
3375:           t[row+4] = sum5;
3376:           sz      = ii[row+1] - diag[row] - 5;
3377:           idx     = a->j + diag[row] + 5;
3378:           v1 += 5;
3379:           v2 += 5;
3380:           v3 += 5;
3381:           v4 += 5;
3382:           v5 += 5;
3383:           for (n = 0; n<sz-1; n+=2) {
3384:             i1    = idx[0];
3385:             i2    = idx[1];
3386:             idx  += 2;
3387:             tmp0  = x[i1];
3388:             tmp1  = x[i2];
3389:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3390:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3391:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3392:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3393:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3394:           }
3395:           if (n == sz-1) {
3396:             tmp0  = x[*idx];
3397:             sum1 -= v1[0] * tmp0;
3398:             sum2 -= v2[0] * tmp0;
3399:             sum3 -= v3[0] * tmp0;
3400:             sum4 -= v4[0] * tmp0;
3401:             sum5 -= v5[0] * tmp0;
3402:           }
3403:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3404:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3405:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3406:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3407:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3408:           break;
3409:         default:
3410:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3411:         }
3412:       }
3413:       xb   = t;
3414:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3415:     } else xb = b;

3417:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3419:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3420:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3421:         ibdiag -= sizes[i]*sizes[i];

3423:         /* set RHS */
3424:         if (xb == b) {
3425:           /* whole (old way) */
3426:           sz      = ii[row+1] - ii[row];
3427:           idx     = a->j + ii[row];
3428:           switch (sizes[i]) {
3429:           case 5:
3430:             v5      = a->a + ii[row-4];
3431:           case 4: /* fall through */
3432:             v4      = a->a + ii[row-3];
3433:           case 3:
3434:             v3      = a->a + ii[row-2];
3435:           case 2:
3436:             v2      = a->a + ii[row-1];
3437:           case 1:
3438:             v1      = a->a + ii[row];
3439:             break;
3440:           default:
3441:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3442:           }
3443:         } else {
3444:           /* upper, no diag */
3445:           sz      = ii[row+1] - diag[row] - 1;
3446:           idx     = a->j + diag[row] + 1;
3447:           switch (sizes[i]) {
3448:           case 5:
3449:             v5      = a->a + diag[row-4] + 5;
3450:           case 4: /* fall through */
3451:             v4      = a->a + diag[row-3] + 4;
3452:           case 3:
3453:             v3      = a->a + diag[row-2] + 3;
3454:           case 2:
3455:             v2      = a->a + diag[row-1] + 2;
3456:           case 1:
3457:             v1      = a->a + diag[row] + 1;
3458:           }
3459:         }
3460:         /* set sum */
3461:         switch (sizes[i]) {
3462:         case 5:
3463:           sum5 = xb[row-4];
3464:         case 4: /* fall through */
3465:           sum4 = xb[row-3];
3466:         case 3:
3467:           sum3 = xb[row-2];
3468:         case 2:
3469:           sum2 = xb[row-1];
3470:         case 1:
3471:           /* note that sum1 is associated with the last row */
3472:           sum1 = xb[row];
3473:         }
3474:         /* do sums */
3475:         for (n = 0; n<sz-1; n+=2) {
3476:             i1    = idx[0];
3477:             i2    = idx[1];
3478:             idx  += 2;
3479:             tmp0  = x[i1];
3480:             tmp1  = x[i2];
3481:             switch (sizes[i]) {
3482:             case 5:
3483:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3484:             case 4: /* fall through */
3485:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3486:             case 3:
3487:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3488:             case 2:
3489:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3490:             case 1:
3491:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3492:             }
3493:         }
3494:         /* ragged edge */
3495:         if (n == sz-1) {
3496:           tmp0  = x[*idx];
3497:           switch (sizes[i]) {
3498:           case 5:
3499:             sum5 -= *v5*tmp0;
3500:           case 4: /* fall through */
3501:             sum4 -= *v4*tmp0;
3502:           case 3:
3503:             sum3 -= *v3*tmp0;
3504:           case 2:
3505:             sum2 -= *v2*tmp0;
3506:           case 1:
3507:             sum1 -= *v1*tmp0;
3508:           }
3509:         }
3510:         /* update */
3511:         if (xb == b) {
3512:           /* whole (old way) w/ diag */
3513:           switch (sizes[i]) {
3514:           case 5:
3515:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3516:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3517:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3518:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3519:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3520:             break;
3521:           case 4:
3522:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3523:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3524:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3525:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3526:             break;
3527:           case 3:
3528:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3529:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3530:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3531:             break;
3532:           case 2:
3533:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3534:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3535:             break;
3536:           case 1:
3537:             x[row--] += sum1*(*ibdiag);
3538:             break;
3539:           }
3540:         } else {
3541:           /* no diag so set =  */
3542:           switch (sizes[i]) {
3543:           case 5:
3544:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3545:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3546:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3547:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3548:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3549:             break;
3550:           case 4:
3551:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3552:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3553:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3554:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3555:             break;
3556:           case 3:
3557:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3558:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3559:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3560:             break;
3561:           case 2:
3562:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3563:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3564:             break;
3565:           case 1:
3566:             x[row--] = sum1*(*ibdiag);
3567:             break;
3568:           }
3569:         }
3570:       }
3571:       if (xb == b) {
3572:         PetscLogFlops(2.0*a->nz);
3573:       } else {
3574:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3575:       }
3576:     }
3577:   }
3578:   if (flag & SOR_EISENSTAT) {
3579:     /*
3580:           Apply  (U + D)^-1  where D is now the block diagonal
3581:     */
3582:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3583:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3584:       ibdiag -= sizes[i]*sizes[i];
3585:       sz      = ii[row+1] - diag[row] - 1;
3586:       v1      = a->a + diag[row] + 1;
3587:       idx     = a->j + diag[row] + 1;
3588:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3589:       switch (sizes[i]) {
3590:       case 1:

3592:         sum1 = b[row];
3593:         for (n = 0; n<sz-1; n+=2) {
3594:           i1    = idx[0];
3595:           i2    = idx[1];
3596:           idx  += 2;
3597:           tmp0  = x[i1];
3598:           tmp1  = x[i2];
3599:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3600:         }

3602:         if (n == sz-1) {
3603:           tmp0  = x[*idx];
3604:           sum1 -= *v1*tmp0;
3605:         }
3606:         x[row] = sum1*(*ibdiag);row--;
3607:         break;

3609:       case 2:

3611:         sum1 = b[row];
3612:         sum2 = b[row-1];
3613:         /* note that sum1 is associated with the second of the two rows */
3614:         v2 = a->a + diag[row-1] + 2;
3615:         for (n = 0; n<sz-1; n+=2) {
3616:           i1    = idx[0];
3617:           i2    = idx[1];
3618:           idx  += 2;
3619:           tmp0  = x[i1];
3620:           tmp1  = x[i2];
3621:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3622:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3623:         }

3625:         if (n == sz-1) {
3626:           tmp0  = x[*idx];
3627:           sum1 -= *v1*tmp0;
3628:           sum2 -= *v2*tmp0;
3629:         }
3630:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3631:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3632:         row     -= 2;
3633:         break;
3634:       case 3:

3636:         sum1 = b[row];
3637:         sum2 = b[row-1];
3638:         sum3 = b[row-2];
3639:         v2   = a->a + diag[row-1] + 2;
3640:         v3   = a->a + diag[row-2] + 3;
3641:         for (n = 0; n<sz-1; n+=2) {
3642:           i1    = idx[0];
3643:           i2    = idx[1];
3644:           idx  += 2;
3645:           tmp0  = x[i1];
3646:           tmp1  = x[i2];
3647:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3648:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3649:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3650:         }

3652:         if (n == sz-1) {
3653:           tmp0  = x[*idx];
3654:           sum1 -= *v1*tmp0;
3655:           sum2 -= *v2*tmp0;
3656:           sum3 -= *v3*tmp0;
3657:         }
3658:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3659:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3660:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3661:         row     -= 3;
3662:         break;
3663:       case 4:

3665:         sum1 = b[row];
3666:         sum2 = b[row-1];
3667:         sum3 = b[row-2];
3668:         sum4 = b[row-3];
3669:         v2   = a->a + diag[row-1] + 2;
3670:         v3   = a->a + diag[row-2] + 3;
3671:         v4   = a->a + diag[row-3] + 4;
3672:         for (n = 0; n<sz-1; n+=2) {
3673:           i1    = idx[0];
3674:           i2    = idx[1];
3675:           idx  += 2;
3676:           tmp0  = x[i1];
3677:           tmp1  = x[i2];
3678:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3679:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3680:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3681:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3682:         }

3684:         if (n == sz-1) {
3685:           tmp0  = x[*idx];
3686:           sum1 -= *v1*tmp0;
3687:           sum2 -= *v2*tmp0;
3688:           sum3 -= *v3*tmp0;
3689:           sum4 -= *v4*tmp0;
3690:         }
3691:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3692:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3693:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3694:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3695:         row     -= 4;
3696:         break;
3697:       case 5:

3699:         sum1 = b[row];
3700:         sum2 = b[row-1];
3701:         sum3 = b[row-2];
3702:         sum4 = b[row-3];
3703:         sum5 = b[row-4];
3704:         v2   = a->a + diag[row-1] + 2;
3705:         v3   = a->a + diag[row-2] + 3;
3706:         v4   = a->a + diag[row-3] + 4;
3707:         v5   = a->a + diag[row-4] + 5;
3708:         for (n = 0; n<sz-1; n+=2) {
3709:           i1    = idx[0];
3710:           i2    = idx[1];
3711:           idx  += 2;
3712:           tmp0  = x[i1];
3713:           tmp1  = x[i2];
3714:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3715:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3716:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3717:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3718:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3719:         }

3721:         if (n == sz-1) {
3722:           tmp0  = x[*idx];
3723:           sum1 -= *v1*tmp0;
3724:           sum2 -= *v2*tmp0;
3725:           sum3 -= *v3*tmp0;
3726:           sum4 -= *v4*tmp0;
3727:           sum5 -= *v5*tmp0;
3728:         }
3729:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3730:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3731:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3732:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3733:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3734:         row     -= 5;
3735:         break;
3736:       default:
3737:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3738:       }
3739:     }
3740:     PetscLogFlops(a->nz);

3742:     /*
3743:            t = b - D x    where D is the block diagonal
3744:     */
3745:     cnt = 0;
3746:     for (i=0, row=0; i<m; i++) {
3747:       switch (sizes[i]) {
3748:       case 1:
3749:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3750:         break;
3751:       case 2:
3752:         x1       = x[row]; x2 = x[row+1];
3753:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3754:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3755:         t[row]   = b[row] - tmp1;
3756:         t[row+1] = b[row+1] - tmp2; row += 2;
3757:         cnt     += 4;
3758:         break;
3759:       case 3:
3760:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3761:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3762:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3763:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3764:         t[row]   = b[row] - tmp1;
3765:         t[row+1] = b[row+1] - tmp2;
3766:         t[row+2] = b[row+2] - tmp3; row += 3;
3767:         cnt     += 9;
3768:         break;
3769:       case 4:
3770:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3771:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3772:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3773:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3774:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3775:         t[row]   = b[row] - tmp1;
3776:         t[row+1] = b[row+1] - tmp2;
3777:         t[row+2] = b[row+2] - tmp3;
3778:         t[row+3] = b[row+3] - tmp4; row += 4;
3779:         cnt     += 16;
3780:         break;
3781:       case 5:
3782:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3783:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3784:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3785:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3786:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3787:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3788:         t[row]   = b[row] - tmp1;
3789:         t[row+1] = b[row+1] - tmp2;
3790:         t[row+2] = b[row+2] - tmp3;
3791:         t[row+3] = b[row+3] - tmp4;
3792:         t[row+4] = b[row+4] - tmp5;row += 5;
3793:         cnt     += 25;
3794:         break;
3795:       default:
3796:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3797:       }
3798:     }
3799:     PetscLogFlops(m);



3803:     /*
3804:           Apply (L + D)^-1 where D is the block diagonal
3805:     */
3806:     for (i=0, row=0; i<m; i++) {
3807:       sz  = diag[row] - ii[row];
3808:       v1  = a->a + ii[row];
3809:       idx = a->j + ii[row];
3810:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3811:       switch (sizes[i]) {
3812:       case 1:

3814:         sum1 = t[row];
3815:         for (n = 0; n<sz-1; n+=2) {
3816:           i1    = idx[0];
3817:           i2    = idx[1];
3818:           idx  += 2;
3819:           tmp0  = t[i1];
3820:           tmp1  = t[i2];
3821:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3822:         }

3824:         if (n == sz-1) {
3825:           tmp0  = t[*idx];
3826:           sum1 -= *v1 * tmp0;
3827:         }
3828:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3829:         break;
3830:       case 2:
3831:         v2   = a->a + ii[row+1];
3832:         sum1 = t[row];
3833:         sum2 = t[row+1];
3834:         for (n = 0; n<sz-1; n+=2) {
3835:           i1    = idx[0];
3836:           i2    = idx[1];
3837:           idx  += 2;
3838:           tmp0  = t[i1];
3839:           tmp1  = t[i2];
3840:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3841:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3842:         }

3844:         if (n == sz-1) {
3845:           tmp0  = t[*idx];
3846:           sum1 -= v1[0] * tmp0;
3847:           sum2 -= v2[0] * tmp0;
3848:         }
3849:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3850:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3851:         ibdiag   += 4; row += 2;
3852:         break;
3853:       case 3:
3854:         v2   = a->a + ii[row+1];
3855:         v3   = a->a + ii[row+2];
3856:         sum1 = t[row];
3857:         sum2 = t[row+1];
3858:         sum3 = t[row+2];
3859:         for (n = 0; n<sz-1; n+=2) {
3860:           i1    = idx[0];
3861:           i2    = idx[1];
3862:           idx  += 2;
3863:           tmp0  = t[i1];
3864:           tmp1  = t[i2];
3865:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3866:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3867:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3868:         }

3870:         if (n == sz-1) {
3871:           tmp0  = t[*idx];
3872:           sum1 -= v1[0] * tmp0;
3873:           sum2 -= v2[0] * tmp0;
3874:           sum3 -= v3[0] * tmp0;
3875:         }
3876:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3877:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3878:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3879:         ibdiag   += 9; row += 3;
3880:         break;
3881:       case 4:
3882:         v2   = a->a + ii[row+1];
3883:         v3   = a->a + ii[row+2];
3884:         v4   = a->a + ii[row+3];
3885:         sum1 = t[row];
3886:         sum2 = t[row+1];
3887:         sum3 = t[row+2];
3888:         sum4 = t[row+3];
3889:         for (n = 0; n<sz-1; n+=2) {
3890:           i1    = idx[0];
3891:           i2    = idx[1];
3892:           idx  += 2;
3893:           tmp0  = t[i1];
3894:           tmp1  = t[i2];
3895:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3896:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3897:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3898:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3899:         }

3901:         if (n == sz-1) {
3902:           tmp0  = t[*idx];
3903:           sum1 -= v1[0] * tmp0;
3904:           sum2 -= v2[0] * tmp0;
3905:           sum3 -= v3[0] * tmp0;
3906:           sum4 -= v4[0] * tmp0;
3907:         }
3908:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3909:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3910:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3911:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3912:         ibdiag   += 16; row += 4;
3913:         break;
3914:       case 5:
3915:         v2   = a->a + ii[row+1];
3916:         v3   = a->a + ii[row+2];
3917:         v4   = a->a + ii[row+3];
3918:         v5   = a->a + ii[row+4];
3919:         sum1 = t[row];
3920:         sum2 = t[row+1];
3921:         sum3 = t[row+2];
3922:         sum4 = t[row+3];
3923:         sum5 = t[row+4];
3924:         for (n = 0; n<sz-1; n+=2) {
3925:           i1    = idx[0];
3926:           i2    = idx[1];
3927:           idx  += 2;
3928:           tmp0  = t[i1];
3929:           tmp1  = t[i2];
3930:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3935:         }

3937:         if (n == sz-1) {
3938:           tmp0  = t[*idx];
3939:           sum1 -= v1[0] * tmp0;
3940:           sum2 -= v2[0] * tmp0;
3941:           sum3 -= v3[0] * tmp0;
3942:           sum4 -= v4[0] * tmp0;
3943:           sum5 -= v5[0] * tmp0;
3944:         }
3945:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3946:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3947:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3948:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3949:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3950:         ibdiag   += 25; row += 5;
3951:         break;
3952:       default:
3953:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3954:       }
3955:     }
3956:     PetscLogFlops(a->nz);
3957:   }
3958:   VecRestoreArray(xx,&x);
3959:   VecRestoreArrayRead(bb,&b);
3960:   return(0);
3961: }

3963: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3964: {
3965:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3966:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3967:   const MatScalar   *bdiag = a->inode.bdiag;
3968:   const PetscScalar *b;
3969:   PetscErrorCode    ierr;
3970:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3971:   const PetscInt    *sizes = a->inode.size;

3974:   VecGetArray(xx,&x);
3975:   VecGetArrayRead(bb,&b);
3976:   cnt  = 0;
3977:   for (i=0, row=0; i<m; i++) {
3978:     switch (sizes[i]) {
3979:     case 1:
3980:       x[row] = b[row]*bdiag[cnt++];row++;
3981:       break;
3982:     case 2:
3983:       x1       = b[row]; x2 = b[row+1];
3984:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3985:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3986:       x[row++] = tmp1;
3987:       x[row++] = tmp2;
3988:       cnt     += 4;
3989:       break;
3990:     case 3:
3991:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3992:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3993:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3994:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3995:       x[row++] = tmp1;
3996:       x[row++] = tmp2;
3997:       x[row++] = tmp3;
3998:       cnt     += 9;
3999:       break;
4000:     case 4:
4001:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4002:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4003:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4004:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4005:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4006:       x[row++] = tmp1;
4007:       x[row++] = tmp2;
4008:       x[row++] = tmp3;
4009:       x[row++] = tmp4;
4010:       cnt     += 16;
4011:       break;
4012:     case 5:
4013:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4014:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4015:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4016:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4017:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4018:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4019:       x[row++] = tmp1;
4020:       x[row++] = tmp2;
4021:       x[row++] = tmp3;
4022:       x[row++] = tmp4;
4023:       x[row++] = tmp5;
4024:       cnt     += 25;
4025:       break;
4026:     default:
4027:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4028:     }
4029:   }
4030:   PetscLogFlops(2*cnt);
4031:   VecRestoreArray(xx,&x);
4032:   VecRestoreArrayRead(bb,&b);
4033:   return(0);
4034: }

4036: /*
4037:     samestructure indicates that the matrix has not changed its nonzero structure so we
4038:     do not need to recompute the inodes
4039: */
4040: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4041: {
4042:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4044:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4045:   PetscBool      flag;
4046:   const PetscInt *idx,*idy,*ii;

4049:   if (!a->inode.use) return(0);
4050:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4052:   m = A->rmap->n;
4053:   if (a->inode.size) ns = a->inode.size;
4054:   else {
4055:     PetscMalloc1(m+1,&ns);
4056:   }

4058:   i          = 0;
4059:   node_count = 0;
4060:   idx        = a->j;
4061:   ii         = a->i;
4062:   while (i < m) {                /* For each row */
4063:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4064:     /* Limits the number of elements in a node to 'a->inode.limit' */
4065:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4066:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4067:       if (nzy != nzx) break;
4068:       idy += nzx;              /* Same nonzero pattern */
4069:       PetscArraycmp(idx,idy,nzx,&flag);
4070:       if (!flag) break;
4071:     }
4072:     ns[node_count++] = blk_size;
4073:     idx             += blk_size*nzx;
4074:     i                = j;
4075:   }

4077:   {
4078:     PetscBool is_cudatype;
4079:     PetscObjectTypeCompareAny((PetscObject)A,&is_cudatype, MATAIJCUSPARSE, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MATAIJVIENNACL, MATSEQAIJVIENNACL, MATMPIAIJVIENNACL,"");
4080:     if (is_cudatype) {
4081:       PetscInfo(A,"Not using Inode routines on GPU matrix\n");
4082:       PetscFree(ns);
4083:       a->inode.node_count       = 0;
4084:       a->inode.size             = NULL;
4085:       a->inode.use              = PETSC_FALSE;
4086:       a->inode.checked          = PETSC_TRUE;
4087:       a->inode.mat_nonzerostate = A->nonzerostate;
4088:       return(0);
4089:     }
4090:   }

4092:   /* If not enough inodes found,, do not use inode version of the routines */
4093:   if (!m || node_count > .8*m) {
4094:     PetscFree(ns);

4096:     a->inode.node_count       = 0;
4097:     a->inode.size             = NULL;
4098:     a->inode.use              = PETSC_FALSE;
4099:     A->ops->mult              = MatMult_SeqAIJ;
4100:     A->ops->sor               = MatSOR_SeqAIJ;
4101:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4102:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4103:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4104:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4105:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4106:     A->ops->coloringpatch     = 0;
4107:     A->ops->multdiagonalblock = 0;

4109:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4110:   } else {
4111:     if (!A->factortype) {
4112:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4113:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4114:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4115:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4116:       if (A->rmap->n == A->cmap->n) {
4117:         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4118:         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4119:         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4120:         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4121:         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4122:       }
4123:     } else {
4124:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4125:     }
4126:     a->inode.node_count = node_count;
4127:     a->inode.size       = ns;
4128:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4129:   }
4130:   a->inode.checked          = PETSC_TRUE;
4131:   a->inode.mat_nonzerostate = A->nonzerostate;
4132:   return(0);
4133: }

4135: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4136: {
4137:   Mat            B =*C;
4138:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4140:   PetscInt       m=A->rmap->n;

4143:   c->inode.use       = a->inode.use;
4144:   c->inode.limit     = a->inode.limit;
4145:   c->inode.max_limit = a->inode.max_limit;
4146:   if (a->inode.size) {
4147:     PetscMalloc1(m+1,&c->inode.size);
4148:     c->inode.node_count = a->inode.node_count;
4149:     PetscArraycpy(c->inode.size,a->inode.size,m+1);
4150:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4151:     if (!B->factortype) {
4152:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4153:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4154:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4155:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4156:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4157:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4158:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4159:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4160:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4161:     } else {
4162:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4163:     }
4164:   } else {
4165:     c->inode.size       = 0;
4166:     c->inode.node_count = 0;
4167:   }
4168:   c->inode.ibdiagvalid = PETSC_FALSE;
4169:   c->inode.ibdiag      = 0;
4170:   c->inode.bdiag       = 0;
4171:   return(0);
4172: }

4174: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4175: {
4176:   PetscInt       k;
4177:   const PetscInt *vi;

4180:   vi = aj + ai[row];
4181:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4182:   vi        = aj + adiag[row];
4183:   cols[nzl] = vi[0];
4184:   vi        = aj + adiag[row+1]+1;
4185:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4186:   return(0);
4187: }
4188: /*
4189:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4190:    Modified from MatSeqAIJCheckInode().

4192:    Input Parameters:
4193: .  Mat A - ILU or LU matrix factor

4195: */
4196: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4197: {
4198:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4200:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4201:   PetscInt       *cols1,*cols2,*ns;
4202:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4203:   PetscBool      flag;

4206:   if (!a->inode.use)    return(0);
4207:   if (a->inode.checked) return(0);

4209:   m = A->rmap->n;
4210:   if (a->inode.size) ns = a->inode.size;
4211:   else {
4212:     PetscMalloc1(m+1,&ns);
4213:   }

4215:   i          = 0;
4216:   node_count = 0;
4217:   PetscMalloc2(m,&cols1,m,&cols2);
4218:   while (i < m) {                /* For each row */
4219:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4220:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4221:     nzx  = nzl1 + nzu1 + 1;
4222:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4224:     /* Limits the number of elements in a node to 'a->inode.limit' */
4225:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4226:       nzl2 = ai[j+1] - ai[j];
4227:       nzu2 = adiag[j] - adiag[j+1] - 1;
4228:       nzy  = nzl2 + nzu2 + 1;
4229:       if (nzy != nzx) break;
4230:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4231:       PetscArraycmp(cols1,cols2,nzx,&flag);
4232:       if (!flag) break;
4233:     }
4234:     ns[node_count++] = blk_size;
4235:     i                = j;
4236:   }
4237:   PetscFree2(cols1,cols2);
4238:   /* If not enough inodes found,, do not use inode version of the routines */
4239:   if (!m || node_count > .8*m) {
4240:     PetscFree(ns);

4242:     a->inode.node_count = 0;
4243:     a->inode.size       = NULL;
4244:     a->inode.use        = PETSC_FALSE;

4246:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4247:   } else {
4248:     A->ops->mult              = 0;
4249:     A->ops->sor               = 0;
4250:     A->ops->multadd           = 0;
4251:     A->ops->getrowij          = 0;
4252:     A->ops->restorerowij      = 0;
4253:     A->ops->getcolumnij       = 0;
4254:     A->ops->restorecolumnij   = 0;
4255:     A->ops->coloringpatch     = 0;
4256:     A->ops->multdiagonalblock = 0;
4257:     a->inode.node_count       = node_count;
4258:     a->inode.size             = ns;

4260:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4261:   }
4262:   a->inode.checked = PETSC_TRUE;
4263:   return(0);
4264: }

4266: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4267: {
4268:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4271:   a->inode.ibdiagvalid = PETSC_FALSE;
4272:   return(0);
4273: }

4275: /*
4276:      This is really ugly. if inodes are used this replaces the
4277:   permutations with ones that correspond to rows/cols of the matrix
4278:   rather then inode blocks
4279: */
4280: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4281: {

4285:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4286:   return(0);
4287: }

4289: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4290: {
4291:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4293:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4294:   const PetscInt *ridx,*cidx;
4295:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4296:   PetscInt       nslim_col,*ns_col;
4297:   IS             ris = *rperm,cis = *cperm;

4300:   if (!a->inode.size) return(0); /* no inodes so return */
4301:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4303:   MatCreateColInode_Private(A,&nslim_col,&ns_col);
4304:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4305:   PetscMalloc2(m,&permr,n,&permc);

4307:   ISGetIndices(ris,&ridx);
4308:   ISGetIndices(cis,&cidx);

4310:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4311:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4313:   /* Construct the permutations for rows*/
4314:   for (i=0,row = 0; i<nslim_row; ++i) {
4315:     indx      = ridx[i];
4316:     start_val = tns[indx];
4317:     end_val   = tns[indx + 1];
4318:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4319:   }

4321:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4322:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4324:   /* Construct permutations for columns */
4325:   for (i=0,col=0; i<nslim_col; ++i) {
4326:     indx      = cidx[i];
4327:     start_val = tns[indx];
4328:     end_val   = tns[indx + 1];
4329:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4330:   }

4332:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4333:   ISSetPermutation(*rperm);
4334:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4335:   ISSetPermutation(*cperm);

4337:   ISRestoreIndices(ris,&ridx);
4338:   ISRestoreIndices(cis,&cidx);

4340:   PetscFree(ns_col);
4341:   PetscFree2(permr,permc);
4342:   ISDestroy(&cis);
4343:   ISDestroy(&ris);
4344:   PetscFree(tns);
4345:   return(0);
4346: }

4348: /*@C
4349:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4351:    Not Collective

4353:    Input Parameter:
4354: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4356:    Output Parameter:
4357: +  node_count - no of inodes present in the matrix.
4358: .  sizes      - an array of size node_count,with sizes of each inode.
4359: -  limit      - the max size used to generate the inodes.

4361:    Level: advanced

4363:    Notes:
4364:     This routine returns some internal storage information
4365:    of the matrix, it is intended to be used by advanced users.
4366:    It should be called after the matrix is assembled.
4367:    The contents of the sizes[] array should not be changed.
4368:    NULL may be passed for information not requested.

4370: .seealso: MatGetInfo()
4371: @*/
4372: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4373: {
4374:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4377:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4378:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4379:   if (f) {
4380:     (*f)(A,node_count,sizes,limit);
4381:   }
4382:   return(0);
4383: }

4385: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4386: {
4387:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4390:   if (node_count) *node_count = a->inode.node_count;
4391:   if (sizes)      *sizes      = a->inode.size;
4392:   if (limit)      *limit      = a->inode.limit;
4393:   return(0);
4394: }