Actual source code: inode.c


  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 11:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 13:   n = A->cmap->n;
 14:   m = A->rmap->n;
 16:   ns_row = a->inode.size;

 18:   min_mn = (m < n) ? m : n;
 19:   if (!ns) {
 20:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 21:     for (; count+1 < n; count++,i++) ;
 22:     if (count < n)  {
 23:       i++;
 24:     }
 25:     *size = i;
 26:     return 0;
 27:   }
 28:   PetscMalloc1(n+1,&ns_col);

 30:   /* Use the same row structure wherever feasible. */
 31:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 32:     ns_col[i] = ns_row[i];
 33:   }

 35:   /* if m < n; pad up the remainder with inode_limit */
 36:   for (; count+1 < n; count++,i++) {
 37:     ns_col[i] = 1;
 38:   }
 39:   /* The last node is the odd ball. padd it up with the remaining rows; */
 40:   if (count < n) {
 41:     ns_col[i] = n - count;
 42:     i++;
 43:   } else if (count > n) {
 44:     /* Adjust for the over estimation */
 45:     ns_col[i-1] += n - count;
 46:   }
 47:   *size = i;
 48:   *ns   = ns_col;
 49:   return 0;
 50: }

 52: /*
 53:       This builds symmetric version of nonzero structure,
 54: */
 55: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 56: {
 57:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 58:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 59:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 60:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 62:   nslim_row = a->inode.node_count;
 63:   m         = A->rmap->n;
 64:   n         = A->cmap->n;

 68:   /* Use the row_inode as column_inode */
 69:   nslim_col = nslim_row;
 70:   ns_col    = ns_row;

 72:   /* allocate space for reformated inode structure */
 73:   PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
 74:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 76:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 77:     nsz = ns_col[i1];
 78:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 79:   }
 80:   /* allocate space for row pointers */
 81:   PetscCalloc1(nslim_row+1,&ia);
 82:   *iia = ia;
 83:   PetscMalloc1(nslim_row+1,&work);

 85:   /* determine the number of columns in each row */
 86:   ia[0] = oshift;
 87:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 89:     j    = aj + ai[row] + ishift;
 90:     jmax = aj + ai[row+1] + ishift;
 91:     if (j==jmax) continue; /* empty row */
 92:     col  = *j++ + ishift;
 93:     i2   = tvc[col];
 94:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
 95:       ia[i1+1]++;
 96:       ia[i2+1]++;
 97:       i2++;                     /* Start col of next node */
 98:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
 99:       i2 = tvc[col];
100:     }
101:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
102:   }

104:   /* shift ia[i] to point to next row */
105:   for (i1=1; i1<nslim_row+1; i1++) {
106:     row        = ia[i1-1];
107:     ia[i1]    += row;
108:     work[i1-1] = row - oshift;
109:   }

111:   /* allocate space for column pointers */
112:   nz   = ia[nslim_row] + (!ishift);
113:   PetscMalloc1(nz,&ja);
114:   *jja = ja;

116:   /* loop over lower triangular part putting into ja */
117:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
118:     j    = aj + ai[row] + ishift;
119:     jmax = aj + ai[row+1] + ishift;
120:     if (j==jmax) continue; /* empty row */
121:     col  = *j++ + ishift;
122:     i2   = tvc[col];
123:     while (i2<i1 && j<jmax) {
124:       ja[work[i2]++] = i1 + oshift;
125:       ja[work[i1]++] = i2 + oshift;
126:       ++i2;
127:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
128:       i2 = tvc[col];
129:     }
130:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

132:   }
133:   PetscFree(work);
134:   PetscFree2(tns,tvc);
135:   return 0;
136: }

138: /*
139:       This builds nonsymmetric version of nonzero structure,
140: */
141: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
142: {
143:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
144:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
145:   PetscInt       *tns,*tvc,nsz,i1,i2;
146:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

149:   nslim_row = a->inode.node_count;
150:   n         = A->cmap->n;

152:   /* Create The column_inode for this matrix */
153:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

155:   /* allocate space for reformated column_inode structure */
156:   PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
157:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

159:   for (i1=0,col=0; i1<nslim_col; ++i1) {
160:     nsz = ns_col[i1];
161:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
162:   }
163:   /* allocate space for row pointers */
164:   PetscCalloc1(nslim_row+1,&ia);
165:   *iia = ia;
166:   PetscMalloc1(nslim_row+1,&work);

168:   /* determine the number of columns in each row */
169:   ia[0] = oshift;
170:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
171:     j   = aj + ai[row] + ishift;
172:     nz  = ai[row+1] - ai[row];
173:     if (!nz) continue; /* empty row */
174:     col = *j++ + ishift;
175:     i2  = tvc[col];
176:     while (nz-- > 0) {           /* off-diagonal elements */
177:       ia[i1+1]++;
178:       i2++;                     /* Start col of next node */
179:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
180:       if (nz > 0) i2 = tvc[col];
181:     }
182:   }

184:   /* shift ia[i] to point to next row */
185:   for (i1=1; i1<nslim_row+1; i1++) {
186:     row        = ia[i1-1];
187:     ia[i1]    += row;
188:     work[i1-1] = row - oshift;
189:   }

191:   /* allocate space for column pointers */
192:   nz   = ia[nslim_row] + (!ishift);
193:   PetscMalloc1(nz,&ja);
194:   *jja = ja;

196:   /* loop over matrix putting into ja */
197:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
198:     j   = aj + ai[row] + ishift;
199:     nz  = ai[row+1] - ai[row];
200:     if (!nz) continue; /* empty row */
201:     col = *j++ + ishift;
202:     i2  = tvc[col];
203:     while (nz-- > 0) {
204:       ja[work[i1]++] = i2 + oshift;
205:       ++i2;
206:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
207:       if (nz > 0) i2 = tvc[col];
208:     }
209:   }
210:   PetscFree(ns_col);
211:   PetscFree(work);
212:   PetscFree2(tns,tvc);
213:   return 0;
214: }

216: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
217: {
218:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

220:   *n = a->inode.node_count;
221:   if (!ia) return 0;
222:   if (!blockcompressed) {
223:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
224:   } else if (symmetric) {
225:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
226:   } else {
227:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
228:   }
229:   return 0;
230: }

232: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
233: {
234:   if (!ia) return 0;

236:   if (!blockcompressed) {
237:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
238:   } else {
239:     PetscFree(*ia);
240:     PetscFree(*ja);
241:   }
242:   return 0;
243: }

245: /* ----------------------------------------------------------- */

247: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
248: {
249:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
250:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
251:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

254:   nslim_row = a->inode.node_count;
255:   n         = A->cmap->n;

257:   /* Create The column_inode for this matrix */
258:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

260:   /* allocate space for reformated column_inode structure */
261:   PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
262:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

264:   for (i1=0,col=0; i1<nslim_col; ++i1) {
265:     nsz = ns_col[i1];
266:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
267:   }
268:   /* allocate space for column pointers */
269:   PetscCalloc1(nslim_col+1,&ia);
270:   *iia = ia;
271:   PetscMalloc1(nslim_col+1,&work);

273:   /* determine the number of columns in each row */
274:   ia[0] = oshift;
275:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
276:     j   = aj + ai[row] + ishift;
277:     col = *j++ + ishift;
278:     i2  = tvc[col];
279:     nz  = ai[row+1] - ai[row];
280:     while (nz-- > 0) {           /* off-diagonal elements */
281:       /* ia[i1+1]++; */
282:       ia[i2+1]++;
283:       i2++;
284:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285:       if (nz > 0) i2 = tvc[col];
286:     }
287:   }

289:   /* shift ia[i] to point to next col */
290:   for (i1=1; i1<nslim_col+1; i1++) {
291:     col        = ia[i1-1];
292:     ia[i1]    += col;
293:     work[i1-1] = col - oshift;
294:   }

296:   /* allocate space for column pointers */
297:   nz   = ia[nslim_col] + (!ishift);
298:   PetscMalloc1(nz,&ja);
299:   *jja = ja;

301:   /* loop over matrix putting into ja */
302:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
303:     j   = aj + ai[row] + ishift;
304:     col = *j++ + ishift;
305:     i2  = tvc[col];
306:     nz  = ai[row+1] - ai[row];
307:     while (nz-- > 0) {
308:       /* ja[work[i1]++] = i2 + oshift; */
309:       ja[work[i2]++] = i1 + oshift;
310:       i2++;
311:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312:       if (nz > 0) i2 = tvc[col];
313:     }
314:   }
315:   PetscFree(ns_col);
316:   PetscFree(work);
317:   PetscFree2(tns,tvc);
318:   return 0;
319: }

321: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
322: {
323:   MatCreateColInode_Private(A,n,NULL);
324:   if (!ia) return 0;

326:   if (!blockcompressed) {
327:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
328:   } else if (symmetric) {
329:     /* Since the indices are symmetric it doesn't matter */
330:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
331:   } else {
332:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
333:   }
334:   return 0;
335: }

337: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
338: {
339:   if (!ia) return 0;
340:   if (!blockcompressed) {
341:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
342:   } else {
343:     PetscFree(*ia);
344:     PetscFree(*ja);
345:   }
346:   return 0;
347: }

349: /* ----------------------------------------------------------- */

351: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
352: {
353:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
354:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
355:   PetscScalar       *y;
356:   const PetscScalar *x;
357:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
358:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
359:   const PetscInt    *idx,*ns,*ii;

361: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
362: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
363: #endif

366:   node_max = a->inode.node_count;
367:   ns       = a->inode.size;     /* Node Size array */
368:   VecGetArrayRead(xx,&x);
369:   VecGetArray(yy,&y);
370:   idx      = a->j;
371:   v1       = a->a;
372:   ii       = a->i;

374:   for (i = 0,row = 0; i< node_max; ++i) {
375:     nsz         = ns[i];
376:     n           = ii[1] - ii[0];
377:     nonzerorow += (n>0)*nsz;
378:     ii         += nsz;
379:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
380:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
381:     sz = n;                     /* No of non zeros in this row */
382:                                 /* Switch on the size of Node */
383:     switch (nsz) {               /* Each loop in 'case' is unrolled */
384:     case 1:
385:       sum1 = 0.;

387:       for (n = 0; n< sz-1; n+=2) {
388:         i1    = idx[0];         /* The instructions are ordered to */
389:         i2    = idx[1];         /* make the compiler's job easy */
390:         idx  += 2;
391:         tmp0  = x[i1];
392:         tmp1  = x[i2];
393:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
394:       }

396:       if (n == sz-1) {          /* Take care of the last nonzero  */
397:         tmp0  = x[*idx++];
398:         sum1 += *v1++ *tmp0;
399:       }
400:       y[row++]=sum1;
401:       break;
402:     case 2:
403:       sum1 = 0.;
404:       sum2 = 0.;
405:       v2   = v1 + n;

407:       for (n = 0; n< sz-1; n+=2) {
408:         i1    = idx[0];
409:         i2    = idx[1];
410:         idx  += 2;
411:         tmp0  = x[i1];
412:         tmp1  = x[i2];
413:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
414:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
415:       }
416:       if (n == sz-1) {
417:         tmp0  = x[*idx++];
418:         sum1 += *v1++ * tmp0;
419:         sum2 += *v2++ * tmp0;
420:       }
421:       y[row++]=sum1;
422:       y[row++]=sum2;
423:       v1      =v2;              /* Since the next block to be processed starts there*/
424:       idx    +=sz;
425:       break;
426:     case 3:
427:       sum1 = 0.;
428:       sum2 = 0.;
429:       sum3 = 0.;
430:       v2   = v1 + n;
431:       v3   = v2 + n;

433:       for (n = 0; n< sz-1; n+=2) {
434:         i1    = idx[0];
435:         i2    = idx[1];
436:         idx  += 2;
437:         tmp0  = x[i1];
438:         tmp1  = x[i2];
439:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
441:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
442:       }
443:       if (n == sz-1) {
444:         tmp0  = x[*idx++];
445:         sum1 += *v1++ * tmp0;
446:         sum2 += *v2++ * tmp0;
447:         sum3 += *v3++ * tmp0;
448:       }
449:       y[row++]=sum1;
450:       y[row++]=sum2;
451:       y[row++]=sum3;
452:       v1      =v3;              /* Since the next block to be processed starts there*/
453:       idx    +=2*sz;
454:       break;
455:     case 4:
456:       sum1 = 0.;
457:       sum2 = 0.;
458:       sum3 = 0.;
459:       sum4 = 0.;
460:       v2   = v1 + n;
461:       v3   = v2 + n;
462:       v4   = v3 + n;

464:       for (n = 0; n< sz-1; n+=2) {
465:         i1    = idx[0];
466:         i2    = idx[1];
467:         idx  += 2;
468:         tmp0  = x[i1];
469:         tmp1  = x[i2];
470:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
471:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
472:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
473:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
474:       }
475:       if (n == sz-1) {
476:         tmp0  = x[*idx++];
477:         sum1 += *v1++ * tmp0;
478:         sum2 += *v2++ * tmp0;
479:         sum3 += *v3++ * tmp0;
480:         sum4 += *v4++ * tmp0;
481:       }
482:       y[row++]=sum1;
483:       y[row++]=sum2;
484:       y[row++]=sum3;
485:       y[row++]=sum4;
486:       v1      =v4;              /* Since the next block to be processed starts there*/
487:       idx    +=3*sz;
488:       break;
489:     case 5:
490:       sum1 = 0.;
491:       sum2 = 0.;
492:       sum3 = 0.;
493:       sum4 = 0.;
494:       sum5 = 0.;
495:       v2   = v1 + n;
496:       v3   = v2 + n;
497:       v4   = v3 + n;
498:       v5   = v4 + n;

500:       for (n = 0; n<sz-1; n+=2) {
501:         i1    = idx[0];
502:         i2    = idx[1];
503:         idx  += 2;
504:         tmp0  = x[i1];
505:         tmp1  = x[i2];
506:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
507:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
508:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
509:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
510:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
511:       }
512:       if (n == sz-1) {
513:         tmp0  = x[*idx++];
514:         sum1 += *v1++ * tmp0;
515:         sum2 += *v2++ * tmp0;
516:         sum3 += *v3++ * tmp0;
517:         sum4 += *v4++ * tmp0;
518:         sum5 += *v5++ * tmp0;
519:       }
520:       y[row++]=sum1;
521:       y[row++]=sum2;
522:       y[row++]=sum3;
523:       y[row++]=sum4;
524:       y[row++]=sum5;
525:       v1      =v5;       /* Since the next block to be processed starts there */
526:       idx    +=4*sz;
527:       break;
528:     default:
529:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
530:     }
531:   }
532:   VecRestoreArrayRead(xx,&x);
533:   VecRestoreArray(yy,&y);
534:   PetscLogFlops(2.0*a->nz - nonzerorow);
535:   return 0;
536: }
537: /* ----------------------------------------------------------- */
538: /* Almost same code as the MatMult_SeqAIJ_Inode() */
539: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
540: {
541:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
542:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
543:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
544:   const PetscScalar *x;
545:   PetscScalar       *y,*z,*zt;
546:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
547:   const PetscInt    *idx,*ns,*ii;

550:   node_max = a->inode.node_count;
551:   ns       = a->inode.size;     /* Node Size array */

553:   VecGetArrayRead(xx,&x);
554:   VecGetArrayPair(zz,yy,&z,&y);
555:   zt = z;

557:   idx = a->j;
558:   v1  = a->a;
559:   ii  = a->i;

561:   for (i = 0,row = 0; i< node_max; ++i) {
562:     nsz = ns[i];
563:     n   = ii[1] - ii[0];
564:     ii += nsz;
565:     sz  = n;                    /* No of non zeros in this row */
566:                                 /* Switch on the size of Node */
567:     switch (nsz) {               /* Each loop in 'case' is unrolled */
568:     case 1:
569:       sum1 = *zt++;

571:       for (n = 0; n< sz-1; n+=2) {
572:         i1    = idx[0];         /* The instructions are ordered to */
573:         i2    = idx[1];         /* make the compiler's job easy */
574:         idx  += 2;
575:         tmp0  = x[i1];
576:         tmp1  = x[i2];
577:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
578:       }

580:       if (n   == sz-1) {          /* Take care of the last nonzero  */
581:         tmp0  = x[*idx++];
582:         sum1 += *v1++ * tmp0;
583:       }
584:       y[row++]=sum1;
585:       break;
586:     case 2:
587:       sum1 = *zt++;
588:       sum2 = *zt++;
589:       v2   = v1 + n;

591:       for (n = 0; n< sz-1; n+=2) {
592:         i1    = idx[0];
593:         i2    = idx[1];
594:         idx  += 2;
595:         tmp0  = x[i1];
596:         tmp1  = x[i2];
597:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
598:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
599:       }
600:       if (n   == sz-1) {
601:         tmp0  = x[*idx++];
602:         sum1 += *v1++ * tmp0;
603:         sum2 += *v2++ * tmp0;
604:       }
605:       y[row++]=sum1;
606:       y[row++]=sum2;
607:       v1      =v2;              /* Since the next block to be processed starts there*/
608:       idx    +=sz;
609:       break;
610:     case 3:
611:       sum1 = *zt++;
612:       sum2 = *zt++;
613:       sum3 = *zt++;
614:       v2   = v1 + n;
615:       v3   = v2 + n;

617:       for (n = 0; n< sz-1; n+=2) {
618:         i1    = idx[0];
619:         i2    = idx[1];
620:         idx  += 2;
621:         tmp0  = x[i1];
622:         tmp1  = x[i2];
623:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
624:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
625:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
626:       }
627:       if (n == sz-1) {
628:         tmp0  = x[*idx++];
629:         sum1 += *v1++ * tmp0;
630:         sum2 += *v2++ * tmp0;
631:         sum3 += *v3++ * tmp0;
632:       }
633:       y[row++]=sum1;
634:       y[row++]=sum2;
635:       y[row++]=sum3;
636:       v1      =v3;              /* Since the next block to be processed starts there*/
637:       idx    +=2*sz;
638:       break;
639:     case 4:
640:       sum1 = *zt++;
641:       sum2 = *zt++;
642:       sum3 = *zt++;
643:       sum4 = *zt++;
644:       v2   = v1 + n;
645:       v3   = v2 + n;
646:       v4   = v3 + n;

648:       for (n = 0; n< sz-1; n+=2) {
649:         i1    = idx[0];
650:         i2    = idx[1];
651:         idx  += 2;
652:         tmp0  = x[i1];
653:         tmp1  = x[i2];
654:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
655:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
656:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
657:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
658:       }
659:       if (n == sz-1) {
660:         tmp0  = x[*idx++];
661:         sum1 += *v1++ * tmp0;
662:         sum2 += *v2++ * tmp0;
663:         sum3 += *v3++ * tmp0;
664:         sum4 += *v4++ * tmp0;
665:       }
666:       y[row++]=sum1;
667:       y[row++]=sum2;
668:       y[row++]=sum3;
669:       y[row++]=sum4;
670:       v1      =v4;              /* Since the next block to be processed starts there*/
671:       idx    +=3*sz;
672:       break;
673:     case 5:
674:       sum1 = *zt++;
675:       sum2 = *zt++;
676:       sum3 = *zt++;
677:       sum4 = *zt++;
678:       sum5 = *zt++;
679:       v2   = v1 + n;
680:       v3   = v2 + n;
681:       v4   = v3 + n;
682:       v5   = v4 + n;

684:       for (n = 0; n<sz-1; n+=2) {
685:         i1    = idx[0];
686:         i2    = idx[1];
687:         idx  += 2;
688:         tmp0  = x[i1];
689:         tmp1  = x[i2];
690:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
691:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
692:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
693:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
694:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
695:       }
696:       if (n   == sz-1) {
697:         tmp0  = x[*idx++];
698:         sum1 += *v1++ * tmp0;
699:         sum2 += *v2++ * tmp0;
700:         sum3 += *v3++ * tmp0;
701:         sum4 += *v4++ * tmp0;
702:         sum5 += *v5++ * tmp0;
703:       }
704:       y[row++]=sum1;
705:       y[row++]=sum2;
706:       y[row++]=sum3;
707:       y[row++]=sum4;
708:       y[row++]=sum5;
709:       v1      =v5;       /* Since the next block to be processed starts there */
710:       idx    +=4*sz;
711:       break;
712:     default:
713:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
714:     }
715:   }
716:   VecRestoreArrayRead(xx,&x);
717:   VecRestoreArrayPair(zz,yy,&z,&y);
718:   PetscLogFlops(2.0*a->nz);
719:   return 0;
720: }

722: /* ----------------------------------------------------------- */
723: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
724: {
725:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
726:   IS                iscol = a->col,isrow = a->row;
727:   const PetscInt    *r,*c,*rout,*cout;
728:   PetscInt          i,j,n = A->rmap->n,nz;
729:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
730:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
731:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
732:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
733:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
734:   const PetscScalar *b;

737:   node_max = a->inode.node_count;
738:   ns       = a->inode.size;     /* Node Size array */

740:   VecGetArrayRead(bb,&b);
741:   VecGetArrayWrite(xx,&x);
742:   tmp  = a->solve_work;

744:   ISGetIndices(isrow,&rout); r = rout;
745:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

747:   /* forward solve the lower triangular */
748:   tmps = tmp;
749:   aa   = a_a;
750:   aj   = a_j;
751:   ad   = a->diag;

753:   for (i = 0,row = 0; i< node_max; ++i) {
754:     nsz = ns[i];
755:     aii = ai[row];
756:     v1  = aa + aii;
757:     vi  = aj + aii;
758:     nz  = ad[row]- aii;
759:     if (i < node_max-1) {
760:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
761:       * but our indexing to determine it's size could. */
762:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
763:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
764:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
765:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
766:     }

768:     switch (nsz) {               /* Each loop in 'case' is unrolled */
769:     case 1:
770:       sum1 = b[*r++];
771:       for (j=0; j<nz-1; j+=2) {
772:         i0    = vi[0];
773:         i1    = vi[1];
774:         vi   +=2;
775:         tmp0  = tmps[i0];
776:         tmp1  = tmps[i1];
777:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
778:       }
779:       if (j == nz-1) {
780:         tmp0  = tmps[*vi++];
781:         sum1 -= *v1++ *tmp0;
782:       }
783:       tmp[row++]=sum1;
784:       break;
785:     case 2:
786:       sum1 = b[*r++];
787:       sum2 = b[*r++];
788:       v2   = aa + ai[row+1];

790:       for (j=0; j<nz-1; j+=2) {
791:         i0    = vi[0];
792:         i1    = vi[1];
793:         vi   +=2;
794:         tmp0  = tmps[i0];
795:         tmp1  = tmps[i1];
796:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
797:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
798:       }
799:       if (j == nz-1) {
800:         tmp0  = tmps[*vi++];
801:         sum1 -= *v1++ *tmp0;
802:         sum2 -= *v2++ *tmp0;
803:       }
804:       sum2     -= *v2++ *sum1;
805:       tmp[row++]=sum1;
806:       tmp[row++]=sum2;
807:       break;
808:     case 3:
809:       sum1 = b[*r++];
810:       sum2 = b[*r++];
811:       sum3 = b[*r++];
812:       v2   = aa + ai[row+1];
813:       v3   = aa + ai[row+2];

815:       for (j=0; j<nz-1; j+=2) {
816:         i0    = vi[0];
817:         i1    = vi[1];
818:         vi   +=2;
819:         tmp0  = tmps[i0];
820:         tmp1  = tmps[i1];
821:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
823:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
824:       }
825:       if (j == nz-1) {
826:         tmp0  = tmps[*vi++];
827:         sum1 -= *v1++ *tmp0;
828:         sum2 -= *v2++ *tmp0;
829:         sum3 -= *v3++ *tmp0;
830:       }
831:       sum2 -= *v2++ * sum1;
832:       sum3 -= *v3++ * sum1;
833:       sum3 -= *v3++ * sum2;

835:       tmp[row++]=sum1;
836:       tmp[row++]=sum2;
837:       tmp[row++]=sum3;
838:       break;

840:     case 4:
841:       sum1 = b[*r++];
842:       sum2 = b[*r++];
843:       sum3 = b[*r++];
844:       sum4 = b[*r++];
845:       v2   = aa + ai[row+1];
846:       v3   = aa + ai[row+2];
847:       v4   = aa + ai[row+3];

849:       for (j=0; j<nz-1; j+=2) {
850:         i0    = vi[0];
851:         i1    = vi[1];
852:         vi   +=2;
853:         tmp0  = tmps[i0];
854:         tmp1  = tmps[i1];
855:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
858:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
859:       }
860:       if (j == nz-1) {
861:         tmp0  = tmps[*vi++];
862:         sum1 -= *v1++ *tmp0;
863:         sum2 -= *v2++ *tmp0;
864:         sum3 -= *v3++ *tmp0;
865:         sum4 -= *v4++ *tmp0;
866:       }
867:       sum2 -= *v2++ * sum1;
868:       sum3 -= *v3++ * sum1;
869:       sum4 -= *v4++ * sum1;
870:       sum3 -= *v3++ * sum2;
871:       sum4 -= *v4++ * sum2;
872:       sum4 -= *v4++ * sum3;

874:       tmp[row++]=sum1;
875:       tmp[row++]=sum2;
876:       tmp[row++]=sum3;
877:       tmp[row++]=sum4;
878:       break;
879:     case 5:
880:       sum1 = b[*r++];
881:       sum2 = b[*r++];
882:       sum3 = b[*r++];
883:       sum4 = b[*r++];
884:       sum5 = b[*r++];
885:       v2   = aa + ai[row+1];
886:       v3   = aa + ai[row+2];
887:       v4   = aa + ai[row+3];
888:       v5   = aa + ai[row+4];

890:       for (j=0; j<nz-1; j+=2) {
891:         i0    = vi[0];
892:         i1    = vi[1];
893:         vi   +=2;
894:         tmp0  = tmps[i0];
895:         tmp1  = tmps[i1];
896:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
897:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
898:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
899:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
900:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
901:       }
902:       if (j == nz-1) {
903:         tmp0  = tmps[*vi++];
904:         sum1 -= *v1++ *tmp0;
905:         sum2 -= *v2++ *tmp0;
906:         sum3 -= *v3++ *tmp0;
907:         sum4 -= *v4++ *tmp0;
908:         sum5 -= *v5++ *tmp0;
909:       }

911:       sum2 -= *v2++ * sum1;
912:       sum3 -= *v3++ * sum1;
913:       sum4 -= *v4++ * sum1;
914:       sum5 -= *v5++ * sum1;
915:       sum3 -= *v3++ * sum2;
916:       sum4 -= *v4++ * sum2;
917:       sum5 -= *v5++ * sum2;
918:       sum4 -= *v4++ * sum3;
919:       sum5 -= *v5++ * sum3;
920:       sum5 -= *v5++ * sum4;

922:       tmp[row++]=sum1;
923:       tmp[row++]=sum2;
924:       tmp[row++]=sum3;
925:       tmp[row++]=sum4;
926:       tmp[row++]=sum5;
927:       break;
928:     default:
929:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
930:     }
931:   }
932:   /* backward solve the upper triangular */
933:   for (i=node_max -1,row = n-1; i>=0; i--) {
934:     nsz = ns[i];
935:     aii = ai[row+1] -1;
936:     v1  = aa + aii;
937:     vi  = aj + aii;
938:     nz  = aii- ad[row];
939:     switch (nsz) {               /* Each loop in 'case' is unrolled */
940:     case 1:
941:       sum1 = tmp[row];

943:       for (j=nz; j>1; j-=2) {
944:         vi   -=2;
945:         i0    = vi[2];
946:         i1    = vi[1];
947:         tmp0  = tmps[i0];
948:         tmp1  = tmps[i1];
949:         v1   -= 2;
950:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
951:       }
952:       if (j==1) {
953:         tmp0  = tmps[*vi--];
954:         sum1 -= *v1-- * tmp0;
955:       }
956:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
957:       break;
958:     case 2:
959:       sum1 = tmp[row];
960:       sum2 = tmp[row -1];
961:       v2   = aa + ai[row]-1;
962:       for (j=nz; j>1; j-=2) {
963:         vi   -=2;
964:         i0    = vi[2];
965:         i1    = vi[1];
966:         tmp0  = tmps[i0];
967:         tmp1  = tmps[i1];
968:         v1   -= 2;
969:         v2   -= 2;
970:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
971:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
972:       }
973:       if (j==1) {
974:         tmp0  = tmps[*vi--];
975:         sum1 -= *v1-- * tmp0;
976:         sum2 -= *v2-- * tmp0;
977:       }

979:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
980:       sum2   -= *v2-- * tmp0;
981:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
982:       break;
983:     case 3:
984:       sum1 = tmp[row];
985:       sum2 = tmp[row -1];
986:       sum3 = tmp[row -2];
987:       v2   = aa + ai[row]-1;
988:       v3   = aa + ai[row -1]-1;
989:       for (j=nz; j>1; j-=2) {
990:         vi   -=2;
991:         i0    = vi[2];
992:         i1    = vi[1];
993:         tmp0  = tmps[i0];
994:         tmp1  = tmps[i1];
995:         v1   -= 2;
996:         v2   -= 2;
997:         v3   -= 2;
998:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
999:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1000:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1001:       }
1002:       if (j==1) {
1003:         tmp0  = tmps[*vi--];
1004:         sum1 -= *v1-- * tmp0;
1005:         sum2 -= *v2-- * tmp0;
1006:         sum3 -= *v3-- * tmp0;
1007:       }
1008:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1009:       sum2   -= *v2-- * tmp0;
1010:       sum3   -= *v3-- * tmp0;
1011:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012:       sum3   -= *v3-- * tmp0;
1013:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1015:       break;
1016:     case 4:
1017:       sum1 = tmp[row];
1018:       sum2 = tmp[row -1];
1019:       sum3 = tmp[row -2];
1020:       sum4 = tmp[row -3];
1021:       v2   = aa + ai[row]-1;
1022:       v3   = aa + ai[row -1]-1;
1023:       v4   = aa + ai[row -2]-1;

1025:       for (j=nz; j>1; j-=2) {
1026:         vi   -=2;
1027:         i0    = vi[2];
1028:         i1    = vi[1];
1029:         tmp0  = tmps[i0];
1030:         tmp1  = tmps[i1];
1031:         v1   -= 2;
1032:         v2   -= 2;
1033:         v3   -= 2;
1034:         v4   -= 2;
1035:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1036:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1037:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1038:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1039:       }
1040:       if (j==1) {
1041:         tmp0  = tmps[*vi--];
1042:         sum1 -= *v1-- * tmp0;
1043:         sum2 -= *v2-- * tmp0;
1044:         sum3 -= *v3-- * tmp0;
1045:         sum4 -= *v4-- * tmp0;
1046:       }

1048:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1049:       sum2   -= *v2-- * tmp0;
1050:       sum3   -= *v3-- * tmp0;
1051:       sum4   -= *v4-- * tmp0;
1052:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1053:       sum3   -= *v3-- * tmp0;
1054:       sum4   -= *v4-- * tmp0;
1055:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1056:       sum4   -= *v4-- * tmp0;
1057:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1058:       break;
1059:     case 5:
1060:       sum1 = tmp[row];
1061:       sum2 = tmp[row -1];
1062:       sum3 = tmp[row -2];
1063:       sum4 = tmp[row -3];
1064:       sum5 = tmp[row -4];
1065:       v2   = aa + ai[row]-1;
1066:       v3   = aa + ai[row -1]-1;
1067:       v4   = aa + ai[row -2]-1;
1068:       v5   = aa + ai[row -3]-1;
1069:       for (j=nz; j>1; j-=2) {
1070:         vi   -= 2;
1071:         i0    = vi[2];
1072:         i1    = vi[1];
1073:         tmp0  = tmps[i0];
1074:         tmp1  = tmps[i1];
1075:         v1   -= 2;
1076:         v2   -= 2;
1077:         v3   -= 2;
1078:         v4   -= 2;
1079:         v5   -= 2;
1080:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1081:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1082:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1083:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1084:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1085:       }
1086:       if (j==1) {
1087:         tmp0  = tmps[*vi--];
1088:         sum1 -= *v1-- * tmp0;
1089:         sum2 -= *v2-- * tmp0;
1090:         sum3 -= *v3-- * tmp0;
1091:         sum4 -= *v4-- * tmp0;
1092:         sum5 -= *v5-- * tmp0;
1093:       }

1095:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1096:       sum2   -= *v2-- * tmp0;
1097:       sum3   -= *v3-- * tmp0;
1098:       sum4   -= *v4-- * tmp0;
1099:       sum5   -= *v5-- * tmp0;
1100:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1101:       sum3   -= *v3-- * tmp0;
1102:       sum4   -= *v4-- * tmp0;
1103:       sum5   -= *v5-- * tmp0;
1104:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105:       sum4   -= *v4-- * tmp0;
1106:       sum5   -= *v5-- * tmp0;
1107:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1108:       sum5   -= *v5-- * tmp0;
1109:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1110:       break;
1111:     default:
1112:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
1113:     }
1114:   }
1115:   ISRestoreIndices(isrow,&rout);
1116:   ISRestoreIndices(iscol,&cout);
1117:   VecRestoreArrayRead(bb,&b);
1118:   VecRestoreArrayWrite(xx,&x);
1119:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1120:   return 0;
1121: }

1123: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1124: {
1125:   Mat             C     =B;
1126:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1127:   IS              isrow = b->row,isicol = b->icol;
1128:   const PetscInt  *r,*ic,*ics;
1129:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1130:   PetscInt        i,j,k,nz,nzL,row,*pj;
1131:   const PetscInt  *ajtmp,*bjtmp;
1132:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1133:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1134:   FactorShiftCtx  sctx;
1135:   const PetscInt  *ddiag;
1136:   PetscReal       rs;
1137:   MatScalar       d;
1138:   PetscInt        inod,nodesz,node_max,col;
1139:   const PetscInt  *ns;
1140:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1142:   /* MatPivotSetUp(): initialize shift context sctx */
1143:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1145:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1146:     ddiag          = a->diag;
1147:     sctx.shift_top = info->zeropivot;
1148:     for (i=0; i<n; i++) {
1149:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1150:       d  = (aa)[ddiag[i]];
1151:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1152:       v  = aa+ai[i];
1153:       nz = ai[i+1] - ai[i];
1154:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1155:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1156:     }
1157:     sctx.shift_top *= 1.1;
1158:     sctx.nshift_max = 5;
1159:     sctx.shift_lo   = 0.;
1160:     sctx.shift_hi   = 1.;
1161:   }

1163:   ISGetIndices(isrow,&r);
1164:   ISGetIndices(isicol,&ic);

1166:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1167:   ics   = ic;

1169:   node_max = a->inode.node_count;
1170:   ns       = a->inode.size;

1173:   /* If max inode size > 4, split it into two inodes.*/
1174:   /* also map the inode sizes according to the ordering */
1175:   PetscMalloc1(n+1,&tmp_vec1);
1176:   for (i=0,j=0; i<node_max; ++i,++j) {
1177:     if (ns[i] > 4) {
1178:       tmp_vec1[j] = 4;
1179:       ++j;
1180:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1181:     } else {
1182:       tmp_vec1[j] = ns[i];
1183:     }
1184:   }
1185:   /* Use the correct node_max */
1186:   node_max = j;

1188:   /* Now reorder the inode info based on mat re-ordering info */
1189:   /* First create a row -> inode_size_array_index map */
1190:   PetscMalloc1(n+1,&nsmap);
1191:   PetscMalloc1(node_max+1,&tmp_vec2);
1192:   for (i=0,row=0; i<node_max; i++) {
1193:     nodesz = tmp_vec1[i];
1194:     for (j=0; j<nodesz; j++,row++) {
1195:       nsmap[row] = i;
1196:     }
1197:   }
1198:   /* Using nsmap, create a reordered ns structure */
1199:   for (i=0,j=0; i< node_max; i++) {
1200:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1201:     tmp_vec2[i] = nodesz;
1202:     j          += nodesz;
1203:   }
1204:   PetscFree(nsmap);
1205:   PetscFree(tmp_vec1);

1207:   /* Now use the correct ns */
1208:   ns = tmp_vec2;

1210:   do {
1211:     sctx.newshift = PETSC_FALSE;
1212:     /* Now loop over each block-row, and do the factorization */
1213:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1214:       nodesz = ns[inod];

1216:       switch (nodesz) {
1217:       case 1:
1218:         /*----------*/
1219:         /* zero rtmp1 */
1220:         /* L part */
1221:         nz    = bi[i+1] - bi[i];
1222:         bjtmp = bj + bi[i];
1223:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1225:         /* U part */
1226:         nz    = bdiag[i]-bdiag[i+1];
1227:         bjtmp = bj + bdiag[i+1]+1;
1228:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1230:         /* load in initial (unfactored row) */
1231:         nz    = ai[r[i]+1] - ai[r[i]];
1232:         ajtmp = aj + ai[r[i]];
1233:         v     = aa + ai[r[i]];
1234:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1236:         /* ZeropivotApply() */
1237:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1239:         /* elimination */
1240:         bjtmp = bj + bi[i];
1241:         row   = *bjtmp++;
1242:         nzL   = bi[i+1] - bi[i];
1243:         for (k=0; k < nzL; k++) {
1244:           pc = rtmp1 + row;
1245:           if (*pc != 0.0) {
1246:             pv   = b->a + bdiag[row];
1247:             mul1 = *pc * (*pv);
1248:             *pc  = mul1;
1249:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1250:             pv   = b->a + bdiag[row+1]+1;
1251:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1252:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1253:             PetscLogFlops(1+2.0*nz);
1254:           }
1255:           row = *bjtmp++;
1256:         }

1258:         /* finished row so stick it into b->a */
1259:         rs = 0.0;
1260:         /* L part */
1261:         pv = b->a + bi[i];
1262:         pj = b->j + bi[i];
1263:         nz = bi[i+1] - bi[i];
1264:         for (j=0; j<nz; j++) {
1265:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1266:         }

1268:         /* U part */
1269:         pv = b->a + bdiag[i+1]+1;
1270:         pj = b->j + bdiag[i+1]+1;
1271:         nz = bdiag[i] - bdiag[i+1]-1;
1272:         for (j=0; j<nz; j++) {
1273:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1274:         }

1276:         /* Check zero pivot */
1277:         sctx.rs = rs;
1278:         sctx.pv = rtmp1[i];
1279:         MatPivotCheck(B,A,info,&sctx,i);
1280:         if (sctx.newshift) break;

1282:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1283:         pv  = b->a + bdiag[i];
1284:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1285:         break;

1287:       case 2:
1288:         /*----------*/
1289:         /* zero rtmp1 and rtmp2 */
1290:         /* L part */
1291:         nz    = bi[i+1] - bi[i];
1292:         bjtmp = bj + bi[i];
1293:         for  (j=0; j<nz; j++) {
1294:           col        = bjtmp[j];
1295:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1296:         }

1298:         /* U part */
1299:         nz    = bdiag[i]-bdiag[i+1];
1300:         bjtmp = bj + bdiag[i+1]+1;
1301:         for  (j=0; j<nz; j++) {
1302:           col        = bjtmp[j];
1303:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1304:         }

1306:         /* load in initial (unfactored row) */
1307:         nz    = ai[r[i]+1] - ai[r[i]];
1308:         ajtmp = aj + ai[r[i]];
1309:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1310:         for (j=0; j<nz; j++) {
1311:           col        = ics[ajtmp[j]];
1312:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1313:         }
1314:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1315:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1317:         /* elimination */
1318:         bjtmp = bj + bi[i];
1319:         row   = *bjtmp++; /* pivot row */
1320:         nzL   = bi[i+1] - bi[i];
1321:         for (k=0; k < nzL; k++) {
1322:           pc1 = rtmp1 + row;
1323:           pc2 = rtmp2 + row;
1324:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1325:             pv   = b->a + bdiag[row];
1326:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1327:             *pc1 = mul1;       *pc2 = mul2;

1329:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1330:             pv = b->a + bdiag[row+1]+1;
1331:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1332:             for (j=0; j<nz; j++) {
1333:               col         = pj[j];
1334:               rtmp1[col] -= mul1 * pv[j];
1335:               rtmp2[col] -= mul2 * pv[j];
1336:             }
1337:             PetscLogFlops(2+4.0*nz);
1338:           }
1339:           row = *bjtmp++;
1340:         }

1342:         /* finished row i; check zero pivot, then stick row i into b->a */
1343:         rs = 0.0;
1344:         /* L part */
1345:         pc1 = b->a + bi[i];
1346:         pj  = b->j + bi[i];
1347:         nz  = bi[i+1] - bi[i];
1348:         for (j=0; j<nz; j++) {
1349:           col    = pj[j];
1350:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1351:         }
1352:         /* U part */
1353:         pc1 = b->a + bdiag[i+1]+1;
1354:         pj  = b->j + bdiag[i+1]+1;
1355:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1356:         for (j=0; j<nz; j++) {
1357:           col    = pj[j];
1358:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1359:         }

1361:         sctx.rs = rs;
1362:         sctx.pv = rtmp1[i];
1363:         MatPivotCheck(B,A,info,&sctx,i);
1364:         if (sctx.newshift) break;
1365:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1366:         *pc1 = 1.0/sctx.pv;

1368:         /* Now take care of diagonal 2x2 block. */
1369:         pc2 = rtmp2 + i;
1370:         if (*pc2 != 0.0) {
1371:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1372:           *pc2 = mul1;          /* insert L entry */
1373:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1374:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1375:           for (j=0; j<nz; j++) {
1376:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1377:           }
1378:           PetscLogFlops(1+2.0*nz);
1379:         }

1381:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1382:         rs = 0.0;
1383:         /* L part */
1384:         pc2 = b->a + bi[i+1];
1385:         pj  = b->j + bi[i+1];
1386:         nz  = bi[i+2] - bi[i+1];
1387:         for (j=0; j<nz; j++) {
1388:           col    = pj[j];
1389:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1390:         }
1391:         /* U part */
1392:         pc2 = b->a + bdiag[i+2]+1;
1393:         pj  = b->j + bdiag[i+2]+1;
1394:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1395:         for (j=0; j<nz; j++) {
1396:           col    = pj[j];
1397:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1398:         }

1400:         sctx.rs = rs;
1401:         sctx.pv = rtmp2[i+1];
1402:         MatPivotCheck(B,A,info,&sctx,i+1);
1403:         if (sctx.newshift) break;
1404:         pc2  = b->a + bdiag[i+1];
1405:         *pc2 = 1.0/sctx.pv;
1406:         break;

1408:       case 3:
1409:         /*----------*/
1410:         /* zero rtmp */
1411:         /* L part */
1412:         nz    = bi[i+1] - bi[i];
1413:         bjtmp = bj + bi[i];
1414:         for  (j=0; j<nz; j++) {
1415:           col        = bjtmp[j];
1416:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1417:         }

1419:         /* U part */
1420:         nz    = bdiag[i]-bdiag[i+1];
1421:         bjtmp = bj + bdiag[i+1]+1;
1422:         for  (j=0; j<nz; j++) {
1423:           col        = bjtmp[j];
1424:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1425:         }

1427:         /* load in initial (unfactored row) */
1428:         nz    = ai[r[i]+1] - ai[r[i]];
1429:         ajtmp = aj + ai[r[i]];
1430:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1431:         for (j=0; j<nz; j++) {
1432:           col        = ics[ajtmp[j]];
1433:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1434:         }
1435:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1436:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1438:         /* elimination */
1439:         bjtmp = bj + bi[i];
1440:         row   = *bjtmp++; /* pivot row */
1441:         nzL   = bi[i+1] - bi[i];
1442:         for (k=0; k < nzL; k++) {
1443:           pc1 = rtmp1 + row;
1444:           pc2 = rtmp2 + row;
1445:           pc3 = rtmp3 + row;
1446:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1447:             pv   = b->a + bdiag[row];
1448:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1449:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1451:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1452:             pv = b->a + bdiag[row+1]+1;
1453:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1454:             for (j=0; j<nz; j++) {
1455:               col         = pj[j];
1456:               rtmp1[col] -= mul1 * pv[j];
1457:               rtmp2[col] -= mul2 * pv[j];
1458:               rtmp3[col] -= mul3 * pv[j];
1459:             }
1460:             PetscLogFlops(3+6.0*nz);
1461:           }
1462:           row = *bjtmp++;
1463:         }

1465:         /* finished row i; check zero pivot, then stick row i into b->a */
1466:         rs = 0.0;
1467:         /* L part */
1468:         pc1 = b->a + bi[i];
1469:         pj  = b->j + bi[i];
1470:         nz  = bi[i+1] - bi[i];
1471:         for (j=0; j<nz; j++) {
1472:           col    = pj[j];
1473:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1474:         }
1475:         /* U part */
1476:         pc1 = b->a + bdiag[i+1]+1;
1477:         pj  = b->j + bdiag[i+1]+1;
1478:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1479:         for (j=0; j<nz; j++) {
1480:           col    = pj[j];
1481:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1482:         }

1484:         sctx.rs = rs;
1485:         sctx.pv = rtmp1[i];
1486:         MatPivotCheck(B,A,info,&sctx,i);
1487:         if (sctx.newshift) break;
1488:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1489:         *pc1 = 1.0/sctx.pv;

1491:         /* Now take care of 1st column of diagonal 3x3 block. */
1492:         pc2 = rtmp2 + i;
1493:         pc3 = rtmp3 + i;
1494:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1495:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1496:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1497:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1498:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1499:           for (j=0; j<nz; j++) {
1500:             col         = pj[j];
1501:             rtmp2[col] -= mul2 * rtmp1[col];
1502:             rtmp3[col] -= mul3 * rtmp1[col];
1503:           }
1504:           PetscLogFlops(2+4.0*nz);
1505:         }

1507:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1508:         rs = 0.0;
1509:         /* L part */
1510:         pc2 = b->a + bi[i+1];
1511:         pj  = b->j + bi[i+1];
1512:         nz  = bi[i+2] - bi[i+1];
1513:         for (j=0; j<nz; j++) {
1514:           col    = pj[j];
1515:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1516:         }
1517:         /* U part */
1518:         pc2 = b->a + bdiag[i+2]+1;
1519:         pj  = b->j + bdiag[i+2]+1;
1520:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1521:         for (j=0; j<nz; j++) {
1522:           col    = pj[j];
1523:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1524:         }

1526:         sctx.rs = rs;
1527:         sctx.pv = rtmp2[i+1];
1528:         MatPivotCheck(B,A,info,&sctx,i+1);
1529:         if (sctx.newshift) break;
1530:         pc2  = b->a + bdiag[i+1];
1531:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1533:         /* Now take care of 2nd column of diagonal 3x3 block. */
1534:         pc3 = rtmp3 + i+1;
1535:         if (*pc3 != 0.0) {
1536:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1537:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1538:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1539:           for (j=0; j<nz; j++) {
1540:             col         = pj[j];
1541:             rtmp3[col] -= mul3 * rtmp2[col];
1542:           }
1543:           PetscLogFlops(1+2.0*nz);
1544:         }

1546:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1547:         rs = 0.0;
1548:         /* L part */
1549:         pc3 = b->a + bi[i+2];
1550:         pj  = b->j + bi[i+2];
1551:         nz  = bi[i+3] - bi[i+2];
1552:         for (j=0; j<nz; j++) {
1553:           col    = pj[j];
1554:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1555:         }
1556:         /* U part */
1557:         pc3 = b->a + bdiag[i+3]+1;
1558:         pj  = b->j + bdiag[i+3]+1;
1559:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1560:         for (j=0; j<nz; j++) {
1561:           col    = pj[j];
1562:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1563:         }

1565:         sctx.rs = rs;
1566:         sctx.pv = rtmp3[i+2];
1567:         MatPivotCheck(B,A,info,&sctx,i+2);
1568:         if (sctx.newshift) break;
1569:         pc3  = b->a + bdiag[i+2];
1570:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1571:         break;
1572:       case 4:
1573:         /*----------*/
1574:         /* zero rtmp */
1575:         /* L part */
1576:         nz    = bi[i+1] - bi[i];
1577:         bjtmp = bj + bi[i];
1578:         for  (j=0; j<nz; j++) {
1579:           col        = bjtmp[j];
1580:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1581:         }

1583:         /* U part */
1584:         nz    = bdiag[i]-bdiag[i+1];
1585:         bjtmp = bj + bdiag[i+1]+1;
1586:         for  (j=0; j<nz; j++) {
1587:           col        = bjtmp[j];
1588:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1589:         }

1591:         /* load in initial (unfactored row) */
1592:         nz    = ai[r[i]+1] - ai[r[i]];
1593:         ajtmp = aj + ai[r[i]];
1594:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1595:         for (j=0; j<nz; j++) {
1596:           col        = ics[ajtmp[j]];
1597:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1598:         }
1599:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1600:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1602:         /* elimination */
1603:         bjtmp = bj + bi[i];
1604:         row   = *bjtmp++; /* pivot row */
1605:         nzL   = bi[i+1] - bi[i];
1606:         for (k=0; k < nzL; k++) {
1607:           pc1 = rtmp1 + row;
1608:           pc2 = rtmp2 + row;
1609:           pc3 = rtmp3 + row;
1610:           pc4 = rtmp4 + row;
1611:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1612:             pv   = b->a + bdiag[row];
1613:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1614:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1616:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1617:             pv = b->a + bdiag[row+1]+1;
1618:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1619:             for (j=0; j<nz; j++) {
1620:               col         = pj[j];
1621:               rtmp1[col] -= mul1 * pv[j];
1622:               rtmp2[col] -= mul2 * pv[j];
1623:               rtmp3[col] -= mul3 * pv[j];
1624:               rtmp4[col] -= mul4 * pv[j];
1625:             }
1626:             PetscLogFlops(4+8.0*nz);
1627:           }
1628:           row = *bjtmp++;
1629:         }

1631:         /* finished row i; check zero pivot, then stick row i into b->a */
1632:         rs = 0.0;
1633:         /* L part */
1634:         pc1 = b->a + bi[i];
1635:         pj  = b->j + bi[i];
1636:         nz  = bi[i+1] - bi[i];
1637:         for (j=0; j<nz; j++) {
1638:           col    = pj[j];
1639:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1640:         }
1641:         /* U part */
1642:         pc1 = b->a + bdiag[i+1]+1;
1643:         pj  = b->j + bdiag[i+1]+1;
1644:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1645:         for (j=0; j<nz; j++) {
1646:           col    = pj[j];
1647:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1648:         }

1650:         sctx.rs = rs;
1651:         sctx.pv = rtmp1[i];
1652:         MatPivotCheck(B,A,info,&sctx,i);
1653:         if (sctx.newshift) break;
1654:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1655:         *pc1 = 1.0/sctx.pv;

1657:         /* Now take care of 1st column of diagonal 4x4 block. */
1658:         pc2 = rtmp2 + i;
1659:         pc3 = rtmp3 + i;
1660:         pc4 = rtmp4 + i;
1661:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1662:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1663:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1664:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1665:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1666:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1667:           for (j=0; j<nz; j++) {
1668:             col         = pj[j];
1669:             rtmp2[col] -= mul2 * rtmp1[col];
1670:             rtmp3[col] -= mul3 * rtmp1[col];
1671:             rtmp4[col] -= mul4 * rtmp1[col];
1672:           }
1673:           PetscLogFlops(3+6.0*nz);
1674:         }

1676:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1677:         rs = 0.0;
1678:         /* L part */
1679:         pc2 = b->a + bi[i+1];
1680:         pj  = b->j + bi[i+1];
1681:         nz  = bi[i+2] - bi[i+1];
1682:         for (j=0; j<nz; j++) {
1683:           col    = pj[j];
1684:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1685:         }
1686:         /* U part */
1687:         pc2 = b->a + bdiag[i+2]+1;
1688:         pj  = b->j + bdiag[i+2]+1;
1689:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1690:         for (j=0; j<nz; j++) {
1691:           col    = pj[j];
1692:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1693:         }

1695:         sctx.rs = rs;
1696:         sctx.pv = rtmp2[i+1];
1697:         MatPivotCheck(B,A,info,&sctx,i+1);
1698:         if (sctx.newshift) break;
1699:         pc2  = b->a + bdiag[i+1];
1700:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1702:         /* Now take care of 2nd column of diagonal 4x4 block. */
1703:         pc3 = rtmp3 + i+1;
1704:         pc4 = rtmp4 + i+1;
1705:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1706:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1707:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1708:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1709:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1710:           for (j=0; j<nz; j++) {
1711:             col         = pj[j];
1712:             rtmp3[col] -= mul3 * rtmp2[col];
1713:             rtmp4[col] -= mul4 * rtmp2[col];
1714:           }
1715:           PetscLogFlops(4.0*nz);
1716:         }

1718:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1719:         rs = 0.0;
1720:         /* L part */
1721:         pc3 = b->a + bi[i+2];
1722:         pj  = b->j + bi[i+2];
1723:         nz  = bi[i+3] - bi[i+2];
1724:         for (j=0; j<nz; j++) {
1725:           col    = pj[j];
1726:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1727:         }
1728:         /* U part */
1729:         pc3 = b->a + bdiag[i+3]+1;
1730:         pj  = b->j + bdiag[i+3]+1;
1731:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1732:         for (j=0; j<nz; j++) {
1733:           col    = pj[j];
1734:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1735:         }

1737:         sctx.rs = rs;
1738:         sctx.pv = rtmp3[i+2];
1739:         MatPivotCheck(B,A,info,&sctx,i+2);
1740:         if (sctx.newshift) break;
1741:         pc3  = b->a + bdiag[i+2];
1742:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1744:         /* Now take care of 3rd column of diagonal 4x4 block. */
1745:         pc4 = rtmp4 + i+2;
1746:         if (*pc4 != 0.0) {
1747:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1748:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1749:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1750:           for (j=0; j<nz; j++) {
1751:             col         = pj[j];
1752:             rtmp4[col] -= mul4 * rtmp3[col];
1753:           }
1754:           PetscLogFlops(1+2.0*nz);
1755:         }

1757:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1758:         rs = 0.0;
1759:         /* L part */
1760:         pc4 = b->a + bi[i+3];
1761:         pj  = b->j + bi[i+3];
1762:         nz  = bi[i+4] - bi[i+3];
1763:         for (j=0; j<nz; j++) {
1764:           col    = pj[j];
1765:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1766:         }
1767:         /* U part */
1768:         pc4 = b->a + bdiag[i+4]+1;
1769:         pj  = b->j + bdiag[i+4]+1;
1770:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1771:         for (j=0; j<nz; j++) {
1772:           col    = pj[j];
1773:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1774:         }

1776:         sctx.rs = rs;
1777:         sctx.pv = rtmp4[i+3];
1778:         MatPivotCheck(B,A,info,&sctx,i+3);
1779:         if (sctx.newshift) break;
1780:         pc4  = b->a + bdiag[i+3];
1781:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1782:         break;

1784:       default:
1785:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
1786:       }
1787:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1788:       i += nodesz;                 /* Update the row */
1789:     }

1791:     /* MatPivotRefine() */
1792:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1793:       /*
1794:        * if no shift in this attempt & shifting & started shifting & can refine,
1795:        * then try lower shift
1796:        */
1797:       sctx.shift_hi       = sctx.shift_fraction;
1798:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1799:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1800:       sctx.newshift       = PETSC_TRUE;
1801:       sctx.nshift++;
1802:     }
1803:   } while (sctx.newshift);

1805:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1806:   PetscFree(tmp_vec2);
1807:   ISRestoreIndices(isicol,&ic);
1808:   ISRestoreIndices(isrow,&r);

1810:   if (b->inode.size) {
1811:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1812:   } else {
1813:     C->ops->solve           = MatSolve_SeqAIJ;
1814:   }
1815:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1816:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1817:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1818:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1819:   C->assembled              = PETSC_TRUE;
1820:   C->preallocated           = PETSC_TRUE;

1822:   PetscLogFlops(C->cmap->n);

1824:   /* MatShiftView(A,info,&sctx) */
1825:   if (sctx.nshift) {
1826:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1827:       PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1828:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1829:       PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1830:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1831:       PetscInfo(A,"number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1832:     }
1833:   }
1834:   return 0;
1835: }

1837: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1838: {
1839:   Mat             C     = B;
1840:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1841:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1842:   const PetscInt  *r,*ic,*c,*ics;
1843:   PetscInt        n   = A->rmap->n,*bi = b->i;
1844:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1845:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1846:   PetscInt        *ai = a->i,*aj = a->j;
1847:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1848:   PetscScalar     mul1,mul2,mul3,tmp;
1849:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1850:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1851:   PetscReal       rs=0.0;
1852:   FactorShiftCtx  sctx;

1854:   sctx.shift_top      = 0;
1855:   sctx.nshift_max     = 0;
1856:   sctx.shift_lo       = 0;
1857:   sctx.shift_hi       = 0;
1858:   sctx.shift_fraction = 0;

1860:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1861:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1862:     sctx.shift_top = 0;
1863:     for (i=0; i<n; i++) {
1864:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1865:       rs    = 0.0;
1866:       ajtmp = aj + ai[i];
1867:       rtmp1 = aa + ai[i];
1868:       nz    = ai[i+1] - ai[i];
1869:       for (j=0; j<nz; j++) {
1870:         if (*ajtmp != i) {
1871:           rs += PetscAbsScalar(*rtmp1++);
1872:         } else {
1873:           rs -= PetscRealPart(*rtmp1++);
1874:         }
1875:         ajtmp++;
1876:       }
1877:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1878:     }
1879:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1880:     sctx.shift_top *= 1.1;
1881:     sctx.nshift_max = 5;
1882:     sctx.shift_lo   = 0.;
1883:     sctx.shift_hi   = 1.;
1884:   }
1885:   sctx.shift_amount = 0;
1886:   sctx.nshift       = 0;

1888:   ISGetIndices(isrow,&r);
1889:   ISGetIndices(iscol,&c);
1890:   ISGetIndices(isicol,&ic);
1891:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1892:   ics    = ic;

1894:   node_max = a->inode.node_count;
1895:   ns       = a->inode.size;

1898:   /* If max inode size > 3, split it into two inodes.*/
1899:   /* also map the inode sizes according to the ordering */
1900:   PetscMalloc1(n+1,&tmp_vec1);
1901:   for (i=0,j=0; i<node_max; ++i,++j) {
1902:     if (ns[i]>3) {
1903:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1904:       ++j;
1905:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1906:     } else {
1907:       tmp_vec1[j] = ns[i];
1908:     }
1909:   }
1910:   /* Use the correct node_max */
1911:   node_max = j;

1913:   /* Now reorder the inode info based on mat re-ordering info */
1914:   /* First create a row -> inode_size_array_index map */
1915:   PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1916:   for (i=0,row=0; i<node_max; i++) {
1917:     nodesz = tmp_vec1[i];
1918:     for (j=0; j<nodesz; j++,row++) {
1919:       nsmap[row] = i;
1920:     }
1921:   }
1922:   /* Using nsmap, create a reordered ns structure */
1923:   for (i=0,j=0; i< node_max; i++) {
1924:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1925:     tmp_vec2[i] = nodesz;
1926:     j          += nodesz;
1927:   }
1928:   PetscFree2(nsmap,tmp_vec1);
1929:   /* Now use the correct ns */
1930:   ns = tmp_vec2;

1932:   do {
1933:     sctx.newshift = PETSC_FALSE;
1934:     /* Now loop over each block-row, and do the factorization */
1935:     for (i=0,row=0; i<node_max; i++) {
1936:       nodesz = ns[i];
1937:       nz     = bi[row+1] - bi[row];
1938:       bjtmp  = bj + bi[row];

1940:       switch (nodesz) {
1941:       case 1:
1942:         for  (j=0; j<nz; j++) {
1943:           idx         = bjtmp[j];
1944:           rtmp11[idx] = 0.0;
1945:         }

1947:         /* load in initial (unfactored row) */
1948:         idx    = r[row];
1949:         nz_tmp = ai[idx+1] - ai[idx];
1950:         ajtmp  = aj + ai[idx];
1951:         v1     = aa + ai[idx];

1953:         for (j=0; j<nz_tmp; j++) {
1954:           idx         = ics[ajtmp[j]];
1955:           rtmp11[idx] = v1[j];
1956:         }
1957:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1959:         prow = *bjtmp++;
1960:         while (prow < row) {
1961:           pc1 = rtmp11 + prow;
1962:           if (*pc1 != 0.0) {
1963:             pv     = ba + bd[prow];
1964:             pj     = nbj + bd[prow];
1965:             mul1   = *pc1 * *pv++;
1966:             *pc1   = mul1;
1967:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1968:             PetscLogFlops(1+2.0*nz_tmp);
1969:             for (j=0; j<nz_tmp; j++) {
1970:               tmp          = pv[j];
1971:               idx          = pj[j];
1972:               rtmp11[idx] -= mul1 * tmp;
1973:             }
1974:           }
1975:           prow = *bjtmp++;
1976:         }
1977:         pj  = bj + bi[row];
1978:         pc1 = ba + bi[row];

1980:         sctx.pv     = rtmp11[row];
1981:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1982:         rs          = 0.0;
1983:         for (j=0; j<nz; j++) {
1984:           idx    = pj[j];
1985:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1986:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1987:         }
1988:         sctx.rs = rs;
1989:         MatPivotCheck(B,A,info,&sctx,row);
1990:         if (sctx.newshift) goto endofwhile;
1991:         break;

1993:       case 2:
1994:         for (j=0; j<nz; j++) {
1995:           idx         = bjtmp[j];
1996:           rtmp11[idx] = 0.0;
1997:           rtmp22[idx] = 0.0;
1998:         }

2000:         /* load in initial (unfactored row) */
2001:         idx    = r[row];
2002:         nz_tmp = ai[idx+1] - ai[idx];
2003:         ajtmp  = aj + ai[idx];
2004:         v1     = aa + ai[idx];
2005:         v2     = aa + ai[idx+1];
2006:         for (j=0; j<nz_tmp; j++) {
2007:           idx         = ics[ajtmp[j]];
2008:           rtmp11[idx] = v1[j];
2009:           rtmp22[idx] = v2[j];
2010:         }
2011:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2012:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2014:         prow = *bjtmp++;
2015:         while (prow < row) {
2016:           pc1 = rtmp11 + prow;
2017:           pc2 = rtmp22 + prow;
2018:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2019:             pv   = ba + bd[prow];
2020:             pj   = nbj + bd[prow];
2021:             mul1 = *pc1 * *pv;
2022:             mul2 = *pc2 * *pv;
2023:             ++pv;
2024:             *pc1 = mul1;
2025:             *pc2 = mul2;

2027:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2028:             for (j=0; j<nz_tmp; j++) {
2029:               tmp          = pv[j];
2030:               idx          = pj[j];
2031:               rtmp11[idx] -= mul1 * tmp;
2032:               rtmp22[idx] -= mul2 * tmp;
2033:             }
2034:             PetscLogFlops(2+4.0*nz_tmp);
2035:           }
2036:           prow = *bjtmp++;
2037:         }

2039:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2040:         pc1 = rtmp11 + prow;
2041:         pc2 = rtmp22 + prow;

2043:         sctx.pv = *pc1;
2044:         pj      = bj + bi[prow];
2045:         rs      = 0.0;
2046:         for (j=0; j<nz; j++) {
2047:           idx = pj[j];
2048:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2049:         }
2050:         sctx.rs = rs;
2051:         MatPivotCheck(B,A,info,&sctx,row);
2052:         if (sctx.newshift) goto endofwhile;

2054:         if (*pc2 != 0.0) {
2055:           pj     = nbj + bd[prow];
2056:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2057:           *pc2   = mul2;
2058:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2059:           for (j=0; j<nz_tmp; j++) {
2060:             idx          = pj[j];
2061:             tmp          = rtmp11[idx];
2062:             rtmp22[idx] -= mul2 * tmp;
2063:           }
2064:           PetscLogFlops(1+2.0*nz_tmp);
2065:         }

2067:         pj  = bj + bi[row];
2068:         pc1 = ba + bi[row];
2069:         pc2 = ba + bi[row+1];

2071:         sctx.pv       = rtmp22[row+1];
2072:         rs            = 0.0;
2073:         rtmp11[row]   = 1.0/rtmp11[row];
2074:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2075:         /* copy row entries from dense representation to sparse */
2076:         for (j=0; j<nz; j++) {
2077:           idx    = pj[j];
2078:           pc1[j] = rtmp11[idx];
2079:           pc2[j] = rtmp22[idx];
2080:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2081:         }
2082:         sctx.rs = rs;
2083:         MatPivotCheck(B,A,info,&sctx,row+1);
2084:         if (sctx.newshift) goto endofwhile;
2085:         break;

2087:       case 3:
2088:         for  (j=0; j<nz; j++) {
2089:           idx         = bjtmp[j];
2090:           rtmp11[idx] = 0.0;
2091:           rtmp22[idx] = 0.0;
2092:           rtmp33[idx] = 0.0;
2093:         }
2094:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2095:         idx    = r[row];
2096:         nz_tmp = ai[idx+1] - ai[idx];
2097:         ajtmp  = aj + ai[idx];
2098:         v1     = aa + ai[idx];
2099:         v2     = aa + ai[idx+1];
2100:         v3     = aa + ai[idx+2];
2101:         for (j=0; j<nz_tmp; j++) {
2102:           idx         = ics[ajtmp[j]];
2103:           rtmp11[idx] = v1[j];
2104:           rtmp22[idx] = v2[j];
2105:           rtmp33[idx] = v3[j];
2106:         }
2107:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2108:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2109:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2111:         /* loop over all pivot row blocks above this row block */
2112:         prow = *bjtmp++;
2113:         while (prow < row) {
2114:           pc1 = rtmp11 + prow;
2115:           pc2 = rtmp22 + prow;
2116:           pc3 = rtmp33 + prow;
2117:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2118:             pv   = ba  + bd[prow];
2119:             pj   = nbj + bd[prow];
2120:             mul1 = *pc1 * *pv;
2121:             mul2 = *pc2 * *pv;
2122:             mul3 = *pc3 * *pv;
2123:             ++pv;
2124:             *pc1 = mul1;
2125:             *pc2 = mul2;
2126:             *pc3 = mul3;

2128:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2129:             /* update this row based on pivot row */
2130:             for (j=0; j<nz_tmp; j++) {
2131:               tmp          = pv[j];
2132:               idx          = pj[j];
2133:               rtmp11[idx] -= mul1 * tmp;
2134:               rtmp22[idx] -= mul2 * tmp;
2135:               rtmp33[idx] -= mul3 * tmp;
2136:             }
2137:             PetscLogFlops(3+6.0*nz_tmp);
2138:           }
2139:           prow = *bjtmp++;
2140:         }

2142:         /* Now take care of diagonal 3x3 block in this set of rows */
2143:         /* note: prow = row here */
2144:         pc1 = rtmp11 + prow;
2145:         pc2 = rtmp22 + prow;
2146:         pc3 = rtmp33 + prow;

2148:         sctx.pv = *pc1;
2149:         pj      = bj + bi[prow];
2150:         rs      = 0.0;
2151:         for (j=0; j<nz; j++) {
2152:           idx = pj[j];
2153:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2154:         }
2155:         sctx.rs = rs;
2156:         MatPivotCheck(B,A,info,&sctx,row);
2157:         if (sctx.newshift) goto endofwhile;

2159:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2160:           mul2   = (*pc2)/(*pc1);
2161:           mul3   = (*pc3)/(*pc1);
2162:           *pc2   = mul2;
2163:           *pc3   = mul3;
2164:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2165:           pj     = nbj + bd[prow];
2166:           for (j=0; j<nz_tmp; j++) {
2167:             idx          = pj[j];
2168:             tmp          = rtmp11[idx];
2169:             rtmp22[idx] -= mul2 * tmp;
2170:             rtmp33[idx] -= mul3 * tmp;
2171:           }
2172:           PetscLogFlops(2+4.0*nz_tmp);
2173:         }
2174:         ++prow;

2176:         pc2     = rtmp22 + prow;
2177:         pc3     = rtmp33 + prow;
2178:         sctx.pv = *pc2;
2179:         pj      = bj + bi[prow];
2180:         rs      = 0.0;
2181:         for (j=0; j<nz; j++) {
2182:           idx = pj[j];
2183:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2184:         }
2185:         sctx.rs = rs;
2186:         MatPivotCheck(B,A,info,&sctx,row+1);
2187:         if (sctx.newshift) goto endofwhile;

2189:         if (*pc3 != 0.0) {
2190:           mul3   = (*pc3)/(*pc2);
2191:           *pc3   = mul3;
2192:           pj     = nbj + bd[prow];
2193:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2194:           for (j=0; j<nz_tmp; j++) {
2195:             idx          = pj[j];
2196:             tmp          = rtmp22[idx];
2197:             rtmp33[idx] -= mul3 * tmp;
2198:           }
2199:           PetscLogFlops(1+2.0*nz_tmp);
2200:         }

2202:         pj  = bj + bi[row];
2203:         pc1 = ba + bi[row];
2204:         pc2 = ba + bi[row+1];
2205:         pc3 = ba + bi[row+2];

2207:         sctx.pv       = rtmp33[row+2];
2208:         rs            = 0.0;
2209:         rtmp11[row]   = 1.0/rtmp11[row];
2210:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2211:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2212:         /* copy row entries from dense representation to sparse */
2213:         for (j=0; j<nz; j++) {
2214:           idx    = pj[j];
2215:           pc1[j] = rtmp11[idx];
2216:           pc2[j] = rtmp22[idx];
2217:           pc3[j] = rtmp33[idx];
2218:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2219:         }

2221:         sctx.rs = rs;
2222:         MatPivotCheck(B,A,info,&sctx,row+2);
2223:         if (sctx.newshift) goto endofwhile;
2224:         break;

2226:       default:
2227:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
2228:       }
2229:       row += nodesz;                 /* Update the row */
2230:     }
2231: endofwhile:;
2232:   } while (sctx.newshift);
2233:   PetscFree3(rtmp11,rtmp22,rtmp33);
2234:   PetscFree(tmp_vec2);
2235:   ISRestoreIndices(isicol,&ic);
2236:   ISRestoreIndices(isrow,&r);
2237:   ISRestoreIndices(iscol,&c);

2239:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2240:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2241:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2242:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2243:   C->assembled              = PETSC_TRUE;
2244:   C->preallocated           = PETSC_TRUE;
2245:   if (sctx.nshift) {
2246:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2247:       PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2248:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2249:       PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2250:     }
2251:   }
2252:   PetscLogFlops(C->cmap->n);
2253:   MatSeqAIJCheckInode(C);
2254:   return 0;
2255: }

2257: /* ----------------------------------------------------------- */
2258: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2259: {
2260:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2261:   IS                iscol = a->col,isrow = a->row;
2262:   const PetscInt    *r,*c,*rout,*cout;
2263:   PetscInt          i,j,n = A->rmap->n;
2264:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2265:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2266:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2267:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2268:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2269:   const PetscScalar *b;

2272:   node_max = a->inode.node_count;
2273:   ns       = a->inode.size;     /* Node Size array */

2275:   VecGetArrayRead(bb,&b);
2276:   VecGetArrayWrite(xx,&x);
2277:   tmp  = a->solve_work;

2279:   ISGetIndices(isrow,&rout); r = rout;
2280:   ISGetIndices(iscol,&cout); c = cout;

2282:   /* forward solve the lower triangular */
2283:   tmps = tmp;
2284:   aa   = a_a;
2285:   aj   = a_j;
2286:   ad   = a->diag;

2288:   for (i = 0,row = 0; i< node_max; ++i) {
2289:     nsz = ns[i];
2290:     aii = ai[row];
2291:     v1  = aa + aii;
2292:     vi  = aj + aii;
2293:     nz  = ai[row+1]- ai[row];

2295:     if (i < node_max-1) {
2296:       /* Prefetch the indices for the next block */
2297:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2298:       /* Prefetch the data for the next block */
2299:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2300:     }

2302:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2303:     case 1:
2304:       sum1 = b[r[row]];
2305:       for (j=0; j<nz-1; j+=2) {
2306:         i0    = vi[j];
2307:         i1    = vi[j+1];
2308:         tmp0  = tmps[i0];
2309:         tmp1  = tmps[i1];
2310:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2311:       }
2312:       if (j == nz-1) {
2313:         tmp0  = tmps[vi[j]];
2314:         sum1 -= v1[j]*tmp0;
2315:       }
2316:       tmp[row++]=sum1;
2317:       break;
2318:     case 2:
2319:       sum1 = b[r[row]];
2320:       sum2 = b[r[row+1]];
2321:       v2   = aa + ai[row+1];

2323:       for (j=0; j<nz-1; j+=2) {
2324:         i0    = vi[j];
2325:         i1    = vi[j+1];
2326:         tmp0  = tmps[i0];
2327:         tmp1  = tmps[i1];
2328:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2329:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2330:       }
2331:       if (j == nz-1) {
2332:         tmp0  = tmps[vi[j]];
2333:         sum1 -= v1[j] *tmp0;
2334:         sum2 -= v2[j] *tmp0;
2335:       }
2336:       sum2     -= v2[nz] * sum1;
2337:       tmp[row++]=sum1;
2338:       tmp[row++]=sum2;
2339:       break;
2340:     case 3:
2341:       sum1 = b[r[row]];
2342:       sum2 = b[r[row+1]];
2343:       sum3 = b[r[row+2]];
2344:       v2   = aa + ai[row+1];
2345:       v3   = aa + ai[row+2];

2347:       for (j=0; j<nz-1; j+=2) {
2348:         i0    = vi[j];
2349:         i1    = vi[j+1];
2350:         tmp0  = tmps[i0];
2351:         tmp1  = tmps[i1];
2352:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2353:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2354:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2355:       }
2356:       if (j == nz-1) {
2357:         tmp0  = tmps[vi[j]];
2358:         sum1 -= v1[j] *tmp0;
2359:         sum2 -= v2[j] *tmp0;
2360:         sum3 -= v3[j] *tmp0;
2361:       }
2362:       sum2     -= v2[nz] * sum1;
2363:       sum3     -= v3[nz] * sum1;
2364:       sum3     -= v3[nz+1] * sum2;
2365:       tmp[row++]=sum1;
2366:       tmp[row++]=sum2;
2367:       tmp[row++]=sum3;
2368:       break;

2370:     case 4:
2371:       sum1 = b[r[row]];
2372:       sum2 = b[r[row+1]];
2373:       sum3 = b[r[row+2]];
2374:       sum4 = b[r[row+3]];
2375:       v2   = aa + ai[row+1];
2376:       v3   = aa + ai[row+2];
2377:       v4   = aa + ai[row+3];

2379:       for (j=0; j<nz-1; j+=2) {
2380:         i0    = vi[j];
2381:         i1    = vi[j+1];
2382:         tmp0  = tmps[i0];
2383:         tmp1  = tmps[i1];
2384:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2385:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2386:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2387:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2388:       }
2389:       if (j == nz-1) {
2390:         tmp0  = tmps[vi[j]];
2391:         sum1 -= v1[j] *tmp0;
2392:         sum2 -= v2[j] *tmp0;
2393:         sum3 -= v3[j] *tmp0;
2394:         sum4 -= v4[j] *tmp0;
2395:       }
2396:       sum2 -= v2[nz] * sum1;
2397:       sum3 -= v3[nz] * sum1;
2398:       sum4 -= v4[nz] * sum1;
2399:       sum3 -= v3[nz+1] * sum2;
2400:       sum4 -= v4[nz+1] * sum2;
2401:       sum4 -= v4[nz+2] * sum3;

2403:       tmp[row++]=sum1;
2404:       tmp[row++]=sum2;
2405:       tmp[row++]=sum3;
2406:       tmp[row++]=sum4;
2407:       break;
2408:     case 5:
2409:       sum1 = b[r[row]];
2410:       sum2 = b[r[row+1]];
2411:       sum3 = b[r[row+2]];
2412:       sum4 = b[r[row+3]];
2413:       sum5 = b[r[row+4]];
2414:       v2   = aa + ai[row+1];
2415:       v3   = aa + ai[row+2];
2416:       v4   = aa + ai[row+3];
2417:       v5   = aa + ai[row+4];

2419:       for (j=0; j<nz-1; j+=2) {
2420:         i0    = vi[j];
2421:         i1    = vi[j+1];
2422:         tmp0  = tmps[i0];
2423:         tmp1  = tmps[i1];
2424:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2429:       }
2430:       if (j == nz-1) {
2431:         tmp0  = tmps[vi[j]];
2432:         sum1 -= v1[j] *tmp0;
2433:         sum2 -= v2[j] *tmp0;
2434:         sum3 -= v3[j] *tmp0;
2435:         sum4 -= v4[j] *tmp0;
2436:         sum5 -= v5[j] *tmp0;
2437:       }

2439:       sum2 -= v2[nz] * sum1;
2440:       sum3 -= v3[nz] * sum1;
2441:       sum4 -= v4[nz] * sum1;
2442:       sum5 -= v5[nz] * sum1;
2443:       sum3 -= v3[nz+1] * sum2;
2444:       sum4 -= v4[nz+1] * sum2;
2445:       sum5 -= v5[nz+1] * sum2;
2446:       sum4 -= v4[nz+2] * sum3;
2447:       sum5 -= v5[nz+2] * sum3;
2448:       sum5 -= v5[nz+3] * sum4;

2450:       tmp[row++]=sum1;
2451:       tmp[row++]=sum2;
2452:       tmp[row++]=sum3;
2453:       tmp[row++]=sum4;
2454:       tmp[row++]=sum5;
2455:       break;
2456:     default:
2457:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2458:     }
2459:   }
2460:   /* backward solve the upper triangular */
2461:   for (i=node_max -1,row = n-1; i>=0; i--) {
2462:     nsz = ns[i];
2463:     aii = ad[row+1] + 1;
2464:     v1  = aa + aii;
2465:     vi  = aj + aii;
2466:     nz  = ad[row]- ad[row+1] - 1;

2468:     if (i > 0) {
2469:       /* Prefetch the indices for the next block */
2470:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2471:       /* Prefetch the data for the next block */
2472:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2473:     }

2475:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2476:     case 1:
2477:       sum1 = tmp[row];

2479:       for (j=0; j<nz-1; j+=2) {
2480:         i0    = vi[j];
2481:         i1    = vi[j+1];
2482:         tmp0  = tmps[i0];
2483:         tmp1  = tmps[i1];
2484:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2485:       }
2486:       if (j == nz-1) {
2487:         tmp0  = tmps[vi[j]];
2488:         sum1 -= v1[j]*tmp0;
2489:       }
2490:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2491:       break;
2492:     case 2:
2493:       sum1 = tmp[row];
2494:       sum2 = tmp[row-1];
2495:       v2   = aa + ad[row] + 1;
2496:       for (j=0; j<nz-1; j+=2) {
2497:         i0    = vi[j];
2498:         i1    = vi[j+1];
2499:         tmp0  = tmps[i0];
2500:         tmp1  = tmps[i1];
2501:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2502:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2503:       }
2504:       if (j == nz-1) {
2505:         tmp0  = tmps[vi[j]];
2506:         sum1 -= v1[j]* tmp0;
2507:         sum2 -= v2[j+1]* tmp0;
2508:       }

2510:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2511:       sum2     -= v2[0] * tmp0;
2512:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2513:       break;
2514:     case 3:
2515:       sum1 = tmp[row];
2516:       sum2 = tmp[row -1];
2517:       sum3 = tmp[row -2];
2518:       v2   = aa + ad[row] + 1;
2519:       v3   = aa + ad[row -1] + 1;
2520:       for (j=0; j<nz-1; j+=2) {
2521:         i0    = vi[j];
2522:         i1    = vi[j+1];
2523:         tmp0  = tmps[i0];
2524:         tmp1  = tmps[i1];
2525:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2526:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2527:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2528:       }
2529:       if (j== nz-1) {
2530:         tmp0  = tmps[vi[j]];
2531:         sum1 -= v1[j] * tmp0;
2532:         sum2 -= v2[j+1] * tmp0;
2533:         sum3 -= v3[j+2] * tmp0;
2534:       }
2535:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2536:       sum2     -= v2[0]* tmp0;
2537:       sum3     -= v3[1] * tmp0;
2538:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2539:       sum3     -= v3[0]* tmp0;
2540:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2542:       break;
2543:     case 4:
2544:       sum1 = tmp[row];
2545:       sum2 = tmp[row -1];
2546:       sum3 = tmp[row -2];
2547:       sum4 = tmp[row -3];
2548:       v2   = aa + ad[row]+1;
2549:       v3   = aa + ad[row -1]+1;
2550:       v4   = aa + ad[row -2]+1;

2552:       for (j=0; j<nz-1; j+=2) {
2553:         i0    = vi[j];
2554:         i1    = vi[j+1];
2555:         tmp0  = tmps[i0];
2556:         tmp1  = tmps[i1];
2557:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2558:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2559:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2560:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2561:       }
2562:       if (j== nz-1) {
2563:         tmp0  = tmps[vi[j]];
2564:         sum1 -= v1[j] * tmp0;
2565:         sum2 -= v2[j+1] * tmp0;
2566:         sum3 -= v3[j+2] * tmp0;
2567:         sum4 -= v4[j+3] * tmp0;
2568:       }

2570:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2571:       sum2     -= v2[0] * tmp0;
2572:       sum3     -= v3[1] * tmp0;
2573:       sum4     -= v4[2] * tmp0;
2574:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2575:       sum3     -= v3[0] * tmp0;
2576:       sum4     -= v4[1] * tmp0;
2577:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2578:       sum4     -= v4[0] * tmp0;
2579:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2580:       break;
2581:     case 5:
2582:       sum1 = tmp[row];
2583:       sum2 = tmp[row -1];
2584:       sum3 = tmp[row -2];
2585:       sum4 = tmp[row -3];
2586:       sum5 = tmp[row -4];
2587:       v2   = aa + ad[row]+1;
2588:       v3   = aa + ad[row -1]+1;
2589:       v4   = aa + ad[row -2]+1;
2590:       v5   = aa + ad[row -3]+1;
2591:       for (j=0; j<nz-1; j+=2) {
2592:         i0    = vi[j];
2593:         i1    = vi[j+1];
2594:         tmp0  = tmps[i0];
2595:         tmp1  = tmps[i1];
2596:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2597:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2598:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2599:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2600:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2601:       }
2602:       if (j==nz-1) {
2603:         tmp0  = tmps[vi[j]];
2604:         sum1 -= v1[j] * tmp0;
2605:         sum2 -= v2[j+1] * tmp0;
2606:         sum3 -= v3[j+2] * tmp0;
2607:         sum4 -= v4[j+3] * tmp0;
2608:         sum5 -= v5[j+4] * tmp0;
2609:       }

2611:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2612:       sum2     -= v2[0] * tmp0;
2613:       sum3     -= v3[1] * tmp0;
2614:       sum4     -= v4[2] * tmp0;
2615:       sum5     -= v5[3] * tmp0;
2616:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2617:       sum3     -= v3[0] * tmp0;
2618:       sum4     -= v4[1] * tmp0;
2619:       sum5     -= v5[2] * tmp0;
2620:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2621:       sum4     -= v4[0] * tmp0;
2622:       sum5     -= v5[1] * tmp0;
2623:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2624:       sum5     -= v5[0] * tmp0;
2625:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2626:       break;
2627:     default:
2628:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2629:     }
2630:   }
2631:   ISRestoreIndices(isrow,&rout);
2632:   ISRestoreIndices(iscol,&cout);
2633:   VecRestoreArrayRead(bb,&b);
2634:   VecRestoreArrayWrite(xx,&x);
2635:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2636:   return 0;
2637: }

2639: /*
2640:      Makes a longer coloring[] array and calls the usual code with that
2641: */
2642: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2643: {
2644:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2645:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2646:   PetscInt        *colorused,i;
2647:   ISColoringValue *newcolor;

2650:   PetscMalloc1(n+1,&newcolor);
2651:   /* loop over inodes, marking a color for each column*/
2652:   row = 0;
2653:   for (i=0; i<m; i++) {
2654:     for (j=0; j<ns[i]; j++) {
2655:       newcolor[row++] = coloring[i] + j*ncolors;
2656:     }
2657:   }

2659:   /* eliminate unneeded colors */
2660:   PetscCalloc1(5*ncolors,&colorused);
2661:   for (i=0; i<n; i++) {
2662:     colorused[newcolor[i]] = 1;
2663:   }

2665:   for (i=1; i<5*ncolors; i++) {
2666:     colorused[i] += colorused[i-1];
2667:   }
2668:   ncolors = colorused[5*ncolors-1];
2669:   for (i=0; i<n; i++) {
2670:     newcolor[i] = colorused[newcolor[i]]-1;
2671:   }
2672:   PetscFree(colorused);
2673:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2674:   PetscFree(coloring);
2675:   return 0;
2676: }

2678: #include <petsc/private/kernels/blockinvert.h>

2680: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2681: {
2682:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2683:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2684:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2685:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2686:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2687:   const PetscScalar *xb, *b;
2688:   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2689:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2690:   PetscInt          sz,k,ipvt[5];
2691:   PetscBool         allowzeropivot,zeropivotdetected;
2692:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2694:   allowzeropivot = PetscNot(A->erroriffailure);

2699:   if (!a->inode.ibdiagvalid) {
2700:     if (!a->inode.ibdiag) {
2701:       /* calculate space needed for diagonal blocks */
2702:       for (i=0; i<m; i++) {
2703:         cnt += sizes[i]*sizes[i];
2704:       }
2705:       a->inode.bdiagsize = cnt;

2707:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2708:     }

2710:     /* copy over the diagonal blocks and invert them */
2711:     ibdiag = a->inode.ibdiag;
2712:     bdiag  = a->inode.bdiag;
2713:     cnt    = 0;
2714:     for (i=0, row = 0; i<m; i++) {
2715:       for (j=0; j<sizes[i]; j++) {
2716:         for (k=0; k<sizes[i]; k++) {
2717:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2718:         }
2719:       }
2720:       PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);

2722:       switch (sizes[i]) {
2723:       case 1:
2724:         /* Create matrix data structure */
2725:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2726:           if (allowzeropivot) {
2727:             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2728:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2729:             A->factorerror_zeropivot_row   = row;
2730:             PetscInfo(A,"Zero pivot, row %" PetscInt_FMT "\n",row);
2731:           } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %" PetscInt_FMT,row);
2732:         }
2733:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2734:         break;
2735:       case 2:
2736:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2737:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2738:         break;
2739:       case 3:
2740:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2741:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2742:         break;
2743:       case 4:
2744:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2745:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2746:         break;
2747:       case 5:
2748:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2749:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2750:         break;
2751:       default:
2752:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2753:       }
2754:       cnt += sizes[i]*sizes[i];
2755:       row += sizes[i];
2756:     }
2757:     a->inode.ibdiagvalid = PETSC_TRUE;
2758:   }
2759:   ibdiag = a->inode.ibdiag;
2760:   bdiag  = a->inode.bdiag;
2761:   t      = a->inode.ssor_work;

2763:   VecGetArray(xx,&x);
2764:   VecGetArrayRead(bb,&b);
2765:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2766:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2767:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2769:       for (i=0, row=0; i<m; i++) {
2770:         sz  = diag[row] - ii[row];
2771:         v1  = a->a + ii[row];
2772:         idx = a->j + ii[row];

2774:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2775:         switch (sizes[i]) {
2776:         case 1:

2778:           sum1 = b[row];
2779:           for (n = 0; n<sz-1; n+=2) {
2780:             i1    = idx[0];
2781:             i2    = idx[1];
2782:             idx  += 2;
2783:             tmp0  = x[i1];
2784:             tmp1  = x[i2];
2785:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2786:           }

2788:           if (n == sz-1) {
2789:             tmp0  = x[*idx];
2790:             sum1 -= *v1 * tmp0;
2791:           }
2792:           t[row]   = sum1;
2793:           x[row++] = sum1*(*ibdiag++);
2794:           break;
2795:         case 2:
2796:           v2   = a->a + ii[row+1];
2797:           sum1 = b[row];
2798:           sum2 = b[row+1];
2799:           for (n = 0; n<sz-1; n+=2) {
2800:             i1    = idx[0];
2801:             i2    = idx[1];
2802:             idx  += 2;
2803:             tmp0  = x[i1];
2804:             tmp1  = x[i2];
2805:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2806:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2807:           }

2809:           if (n == sz-1) {
2810:             tmp0  = x[*idx];
2811:             sum1 -= v1[0] * tmp0;
2812:             sum2 -= v2[0] * tmp0;
2813:           }
2814:           t[row]   = sum1;
2815:           t[row+1] = sum2;
2816:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2817:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2818:           ibdiag  += 4;
2819:           break;
2820:         case 3:
2821:           v2   = a->a + ii[row+1];
2822:           v3   = a->a + ii[row+2];
2823:           sum1 = b[row];
2824:           sum2 = b[row+1];
2825:           sum3 = b[row+2];
2826:           for (n = 0; n<sz-1; n+=2) {
2827:             i1    = idx[0];
2828:             i2    = idx[1];
2829:             idx  += 2;
2830:             tmp0  = x[i1];
2831:             tmp1  = x[i2];
2832:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2833:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2834:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2835:           }

2837:           if (n == sz-1) {
2838:             tmp0  = x[*idx];
2839:             sum1 -= v1[0] * tmp0;
2840:             sum2 -= v2[0] * tmp0;
2841:             sum3 -= v3[0] * tmp0;
2842:           }
2843:           t[row]   = sum1;
2844:           t[row+1] = sum2;
2845:           t[row+2] = sum3;
2846:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2847:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2848:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2849:           ibdiag  += 9;
2850:           break;
2851:         case 4:
2852:           v2   = a->a + ii[row+1];
2853:           v3   = a->a + ii[row+2];
2854:           v4   = a->a + ii[row+3];
2855:           sum1 = b[row];
2856:           sum2 = b[row+1];
2857:           sum3 = b[row+2];
2858:           sum4 = b[row+3];
2859:           for (n = 0; n<sz-1; n+=2) {
2860:             i1    = idx[0];
2861:             i2    = idx[1];
2862:             idx  += 2;
2863:             tmp0  = x[i1];
2864:             tmp1  = x[i2];
2865:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2866:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2867:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2868:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2869:           }

2871:           if (n == sz-1) {
2872:             tmp0  = x[*idx];
2873:             sum1 -= v1[0] * tmp0;
2874:             sum2 -= v2[0] * tmp0;
2875:             sum3 -= v3[0] * tmp0;
2876:             sum4 -= v4[0] * tmp0;
2877:           }
2878:           t[row]   = sum1;
2879:           t[row+1] = sum2;
2880:           t[row+2] = sum3;
2881:           t[row+3] = sum4;
2882:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2883:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2884:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2885:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2886:           ibdiag  += 16;
2887:           break;
2888:         case 5:
2889:           v2   = a->a + ii[row+1];
2890:           v3   = a->a + ii[row+2];
2891:           v4   = a->a + ii[row+3];
2892:           v5   = a->a + ii[row+4];
2893:           sum1 = b[row];
2894:           sum2 = b[row+1];
2895:           sum3 = b[row+2];
2896:           sum4 = b[row+3];
2897:           sum5 = b[row+4];
2898:           for (n = 0; n<sz-1; n+=2) {
2899:             i1    = idx[0];
2900:             i2    = idx[1];
2901:             idx  += 2;
2902:             tmp0  = x[i1];
2903:             tmp1  = x[i2];
2904:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2905:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2906:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2907:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2908:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2909:           }

2911:           if (n == sz-1) {
2912:             tmp0  = x[*idx];
2913:             sum1 -= v1[0] * tmp0;
2914:             sum2 -= v2[0] * tmp0;
2915:             sum3 -= v3[0] * tmp0;
2916:             sum4 -= v4[0] * tmp0;
2917:             sum5 -= v5[0] * tmp0;
2918:           }
2919:           t[row]   = sum1;
2920:           t[row+1] = sum2;
2921:           t[row+2] = sum3;
2922:           t[row+3] = sum4;
2923:           t[row+4] = sum5;
2924:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2925:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2926:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2927:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2928:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2929:           ibdiag  += 25;
2930:           break;
2931:         default:
2932:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2933:         }
2934:       }

2936:       xb   = t;
2937:       PetscLogFlops(a->nz);
2938:     } else xb = b;
2939:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

2941:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2942:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2943:         ibdiag -= sizes[i]*sizes[i];
2944:         sz      = ii[row+1] - diag[row] - 1;
2945:         v1      = a->a + diag[row] + 1;
2946:         idx     = a->j + diag[row] + 1;

2948:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2949:         switch (sizes[i]) {
2950:         case 1:

2952:           sum1 = xb[row];
2953:           for (n = 0; n<sz-1; n+=2) {
2954:             i1    = idx[0];
2955:             i2    = idx[1];
2956:             idx  += 2;
2957:             tmp0  = x[i1];
2958:             tmp1  = x[i2];
2959:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2960:           }

2962:           if (n == sz-1) {
2963:             tmp0  = x[*idx];
2964:             sum1 -= *v1*tmp0;
2965:           }
2966:           x[row--] = sum1*(*ibdiag);
2967:           break;

2969:         case 2:

2971:           sum1 = xb[row];
2972:           sum2 = xb[row-1];
2973:           /* note that sum1 is associated with the second of the two rows */
2974:           v2 = a->a + diag[row-1] + 2;
2975:           for (n = 0; n<sz-1; n+=2) {
2976:             i1    = idx[0];
2977:             i2    = idx[1];
2978:             idx  += 2;
2979:             tmp0  = x[i1];
2980:             tmp1  = x[i2];
2981:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2982:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2983:           }

2985:           if (n == sz-1) {
2986:             tmp0  = x[*idx];
2987:             sum1 -= *v1*tmp0;
2988:             sum2 -= *v2*tmp0;
2989:           }
2990:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2991:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2992:           break;
2993:         case 3:

2995:           sum1 = xb[row];
2996:           sum2 = xb[row-1];
2997:           sum3 = xb[row-2];
2998:           v2   = a->a + diag[row-1] + 2;
2999:           v3   = a->a + diag[row-2] + 3;
3000:           for (n = 0; n<sz-1; n+=2) {
3001:             i1    = idx[0];
3002:             i2    = idx[1];
3003:             idx  += 2;
3004:             tmp0  = x[i1];
3005:             tmp1  = x[i2];
3006:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3007:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3008:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3009:           }

3011:           if (n == sz-1) {
3012:             tmp0  = x[*idx];
3013:             sum1 -= *v1*tmp0;
3014:             sum2 -= *v2*tmp0;
3015:             sum3 -= *v3*tmp0;
3016:           }
3017:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3018:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3019:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3020:           break;
3021:         case 4:

3023:           sum1 = xb[row];
3024:           sum2 = xb[row-1];
3025:           sum3 = xb[row-2];
3026:           sum4 = xb[row-3];
3027:           v2   = a->a + diag[row-1] + 2;
3028:           v3   = a->a + diag[row-2] + 3;
3029:           v4   = a->a + diag[row-3] + 4;
3030:           for (n = 0; n<sz-1; n+=2) {
3031:             i1    = idx[0];
3032:             i2    = idx[1];
3033:             idx  += 2;
3034:             tmp0  = x[i1];
3035:             tmp1  = x[i2];
3036:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3037:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3038:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3039:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3040:           }

3042:           if (n == sz-1) {
3043:             tmp0  = x[*idx];
3044:             sum1 -= *v1*tmp0;
3045:             sum2 -= *v2*tmp0;
3046:             sum3 -= *v3*tmp0;
3047:             sum4 -= *v4*tmp0;
3048:           }
3049:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3050:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3051:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3052:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3053:           break;
3054:         case 5:

3056:           sum1 = xb[row];
3057:           sum2 = xb[row-1];
3058:           sum3 = xb[row-2];
3059:           sum4 = xb[row-3];
3060:           sum5 = xb[row-4];
3061:           v2   = a->a + diag[row-1] + 2;
3062:           v3   = a->a + diag[row-2] + 3;
3063:           v4   = a->a + diag[row-3] + 4;
3064:           v5   = a->a + diag[row-4] + 5;
3065:           for (n = 0; n<sz-1; n+=2) {
3066:             i1    = idx[0];
3067:             i2    = idx[1];
3068:             idx  += 2;
3069:             tmp0  = x[i1];
3070:             tmp1  = x[i2];
3071:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3072:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3073:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3074:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3075:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3076:           }

3078:           if (n == sz-1) {
3079:             tmp0  = x[*idx];
3080:             sum1 -= *v1*tmp0;
3081:             sum2 -= *v2*tmp0;
3082:             sum3 -= *v3*tmp0;
3083:             sum4 -= *v4*tmp0;
3084:             sum5 -= *v5*tmp0;
3085:           }
3086:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3087:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3088:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3089:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3090:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3091:           break;
3092:         default:
3093:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3094:         }
3095:       }

3097:       PetscLogFlops(a->nz);
3098:     }
3099:     its--;
3100:   }
3101:   while (its--) {

3103:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3104:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3105:            i<m;
3106:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3108:         sz  = diag[row] - ii[row];
3109:         v1  = a->a + ii[row];
3110:         idx = a->j + ii[row];
3111:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3112:         switch (sizes[i]) {
3113:         case 1:
3114:           sum1 = b[row];
3115:           for (n = 0; n<sz-1; n+=2) {
3116:             i1    = idx[0];
3117:             i2    = idx[1];
3118:             idx  += 2;
3119:             tmp0  = x[i1];
3120:             tmp1  = x[i2];
3121:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3122:           }
3123:           if (n == sz-1) {
3124:             tmp0  = x[*idx++];
3125:             sum1 -= *v1 * tmp0;
3126:             v1++;
3127:           }
3128:           t[row]   = sum1;
3129:           sz      = ii[row+1] - diag[row] - 1;
3130:           idx     = a->j + diag[row] + 1;
3131:           v1 += 1;
3132:           for (n = 0; n<sz-1; n+=2) {
3133:             i1    = idx[0];
3134:             i2    = idx[1];
3135:             idx  += 2;
3136:             tmp0  = x[i1];
3137:             tmp1  = x[i2];
3138:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3139:           }
3140:           if (n == sz-1) {
3141:             tmp0  = x[*idx++];
3142:             sum1 -= *v1 * tmp0;
3143:           }
3144:           /* in MatSOR_SeqAIJ this line would be
3145:            *
3146:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3147:            *
3148:            * but omega == 1, so this becomes
3149:            *
3150:            * x[row] = sum1*(*ibdiag++);
3151:            *
3152:            */
3153:           x[row] = sum1*(*ibdiag);
3154:           break;
3155:         case 2:
3156:           v2   = a->a + ii[row+1];
3157:           sum1 = b[row];
3158:           sum2 = b[row+1];
3159:           for (n = 0; n<sz-1; n+=2) {
3160:             i1    = idx[0];
3161:             i2    = idx[1];
3162:             idx  += 2;
3163:             tmp0  = x[i1];
3164:             tmp1  = x[i2];
3165:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3166:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3167:           }
3168:           if (n == sz-1) {
3169:             tmp0  = x[*idx++];
3170:             sum1 -= v1[0] * tmp0;
3171:             sum2 -= v2[0] * tmp0;
3172:             v1++; v2++;
3173:           }
3174:           t[row]   = sum1;
3175:           t[row+1] = sum2;
3176:           sz      = ii[row+1] - diag[row] - 2;
3177:           idx     = a->j + diag[row] + 2;
3178:           v1 += 2;
3179:           v2 += 2;
3180:           for (n = 0; n<sz-1; n+=2) {
3181:             i1    = idx[0];
3182:             i2    = idx[1];
3183:             idx  += 2;
3184:             tmp0  = x[i1];
3185:             tmp1  = x[i2];
3186:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3187:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3188:           }
3189:           if (n == sz-1) {
3190:             tmp0  = x[*idx];
3191:             sum1 -= v1[0] * tmp0;
3192:             sum2 -= v2[0] * tmp0;
3193:           }
3194:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3195:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3196:           break;
3197:         case 3:
3198:           v2   = a->a + ii[row+1];
3199:           v3   = a->a + ii[row+2];
3200:           sum1 = b[row];
3201:           sum2 = b[row+1];
3202:           sum3 = b[row+2];
3203:           for (n = 0; n<sz-1; n+=2) {
3204:             i1    = idx[0];
3205:             i2    = idx[1];
3206:             idx  += 2;
3207:             tmp0  = x[i1];
3208:             tmp1  = x[i2];
3209:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3210:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3211:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3212:           }
3213:           if (n == sz-1) {
3214:             tmp0  = x[*idx++];
3215:             sum1 -= v1[0] * tmp0;
3216:             sum2 -= v2[0] * tmp0;
3217:             sum3 -= v3[0] * tmp0;
3218:             v1++; v2++; v3++;
3219:           }
3220:           t[row]   = sum1;
3221:           t[row+1] = sum2;
3222:           t[row+2] = sum3;
3223:           sz      = ii[row+1] - diag[row] - 3;
3224:           idx     = a->j + diag[row] + 3;
3225:           v1 += 3;
3226:           v2 += 3;
3227:           v3 += 3;
3228:           for (n = 0; n<sz-1; n+=2) {
3229:             i1    = idx[0];
3230:             i2    = idx[1];
3231:             idx  += 2;
3232:             tmp0  = x[i1];
3233:             tmp1  = x[i2];
3234:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3235:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3236:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3237:           }
3238:           if (n == sz-1) {
3239:             tmp0  = x[*idx];
3240:             sum1 -= v1[0] * tmp0;
3241:             sum2 -= v2[0] * tmp0;
3242:             sum3 -= v3[0] * tmp0;
3243:           }
3244:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3245:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3246:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3247:           break;
3248:         case 4:
3249:           v2   = a->a + ii[row+1];
3250:           v3   = a->a + ii[row+2];
3251:           v4   = a->a + ii[row+3];
3252:           sum1 = b[row];
3253:           sum2 = b[row+1];
3254:           sum3 = b[row+2];
3255:           sum4 = b[row+3];
3256:           for (n = 0; n<sz-1; n+=2) {
3257:             i1    = idx[0];
3258:             i2    = idx[1];
3259:             idx  += 2;
3260:             tmp0  = x[i1];
3261:             tmp1  = x[i2];
3262:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3263:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3264:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3265:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3266:           }
3267:           if (n == sz-1) {
3268:             tmp0  = x[*idx++];
3269:             sum1 -= v1[0] * tmp0;
3270:             sum2 -= v2[0] * tmp0;
3271:             sum3 -= v3[0] * tmp0;
3272:             sum4 -= v4[0] * tmp0;
3273:             v1++; v2++; v3++; v4++;
3274:           }
3275:           t[row]   = sum1;
3276:           t[row+1] = sum2;
3277:           t[row+2] = sum3;
3278:           t[row+3] = sum4;
3279:           sz      = ii[row+1] - diag[row] - 4;
3280:           idx     = a->j + diag[row] + 4;
3281:           v1 += 4;
3282:           v2 += 4;
3283:           v3 += 4;
3284:           v4 += 4;
3285:           for (n = 0; n<sz-1; n+=2) {
3286:             i1    = idx[0];
3287:             i2    = idx[1];
3288:             idx  += 2;
3289:             tmp0  = x[i1];
3290:             tmp1  = x[i2];
3291:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3292:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3293:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3294:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3295:           }
3296:           if (n == sz-1) {
3297:             tmp0  = x[*idx];
3298:             sum1 -= v1[0] * tmp0;
3299:             sum2 -= v2[0] * tmp0;
3300:             sum3 -= v3[0] * tmp0;
3301:             sum4 -= v4[0] * tmp0;
3302:           }
3303:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3304:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3305:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3306:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3307:           break;
3308:         case 5:
3309:           v2   = a->a + ii[row+1];
3310:           v3   = a->a + ii[row+2];
3311:           v4   = a->a + ii[row+3];
3312:           v5   = a->a + ii[row+4];
3313:           sum1 = b[row];
3314:           sum2 = b[row+1];
3315:           sum3 = b[row+2];
3316:           sum4 = b[row+3];
3317:           sum5 = b[row+4];
3318:           for (n = 0; n<sz-1; n+=2) {
3319:             i1    = idx[0];
3320:             i2    = idx[1];
3321:             idx  += 2;
3322:             tmp0  = x[i1];
3323:             tmp1  = x[i2];
3324:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3325:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3326:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3327:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3328:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3329:           }
3330:           if (n == sz-1) {
3331:             tmp0  = x[*idx++];
3332:             sum1 -= v1[0] * tmp0;
3333:             sum2 -= v2[0] * tmp0;
3334:             sum3 -= v3[0] * tmp0;
3335:             sum4 -= v4[0] * tmp0;
3336:             sum5 -= v5[0] * tmp0;
3337:             v1++; v2++; v3++; v4++; v5++;
3338:           }
3339:           t[row]   = sum1;
3340:           t[row+1] = sum2;
3341:           t[row+2] = sum3;
3342:           t[row+3] = sum4;
3343:           t[row+4] = sum5;
3344:           sz      = ii[row+1] - diag[row] - 5;
3345:           idx     = a->j + diag[row] + 5;
3346:           v1 += 5;
3347:           v2 += 5;
3348:           v3 += 5;
3349:           v4 += 5;
3350:           v5 += 5;
3351:           for (n = 0; n<sz-1; n+=2) {
3352:             i1    = idx[0];
3353:             i2    = idx[1];
3354:             idx  += 2;
3355:             tmp0  = x[i1];
3356:             tmp1  = x[i2];
3357:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3358:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3359:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3360:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3361:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3362:           }
3363:           if (n == sz-1) {
3364:             tmp0  = x[*idx];
3365:             sum1 -= v1[0] * tmp0;
3366:             sum2 -= v2[0] * tmp0;
3367:             sum3 -= v3[0] * tmp0;
3368:             sum4 -= v4[0] * tmp0;
3369:             sum5 -= v5[0] * tmp0;
3370:           }
3371:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3372:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3373:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3374:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3375:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3376:           break;
3377:         default:
3378:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3379:         }
3380:       }
3381:       xb   = t;
3382:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3383:     } else xb = b;

3385:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3387:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3388:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3389:         ibdiag -= sizes[i]*sizes[i];

3391:         /* set RHS */
3392:         if (xb == b) {
3393:           /* whole (old way) */
3394:           sz      = ii[row+1] - ii[row];
3395:           idx     = a->j + ii[row];
3396:           switch (sizes[i]) {
3397:           case 5:
3398:             v5      = a->a + ii[row-4];
3399:           case 4: /* fall through */
3400:             v4      = a->a + ii[row-3];
3401:           case 3:
3402:             v3      = a->a + ii[row-2];
3403:           case 2:
3404:             v2      = a->a + ii[row-1];
3405:           case 1:
3406:             v1      = a->a + ii[row];
3407:             break;
3408:           default:
3409:             SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3410:           }
3411:         } else {
3412:           /* upper, no diag */
3413:           sz      = ii[row+1] - diag[row] - 1;
3414:           idx     = a->j + diag[row] + 1;
3415:           switch (sizes[i]) {
3416:           case 5:
3417:             v5      = a->a + diag[row-4] + 5;
3418:           case 4: /* fall through */
3419:             v4      = a->a + diag[row-3] + 4;
3420:           case 3:
3421:             v3      = a->a + diag[row-2] + 3;
3422:           case 2:
3423:             v2      = a->a + diag[row-1] + 2;
3424:           case 1:
3425:             v1      = a->a + diag[row] + 1;
3426:           }
3427:         }
3428:         /* set sum */
3429:         switch (sizes[i]) {
3430:         case 5:
3431:           sum5 = xb[row-4];
3432:         case 4: /* fall through */
3433:           sum4 = xb[row-3];
3434:         case 3:
3435:           sum3 = xb[row-2];
3436:         case 2:
3437:           sum2 = xb[row-1];
3438:         case 1:
3439:           /* note that sum1 is associated with the last row */
3440:           sum1 = xb[row];
3441:         }
3442:         /* do sums */
3443:         for (n = 0; n<sz-1; n+=2) {
3444:             i1    = idx[0];
3445:             i2    = idx[1];
3446:             idx  += 2;
3447:             tmp0  = x[i1];
3448:             tmp1  = x[i2];
3449:             switch (sizes[i]) {
3450:             case 5:
3451:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3452:             case 4: /* fall through */
3453:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3454:             case 3:
3455:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3456:             case 2:
3457:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3458:             case 1:
3459:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3460:             }
3461:         }
3462:         /* ragged edge */
3463:         if (n == sz-1) {
3464:           tmp0  = x[*idx];
3465:           switch (sizes[i]) {
3466:           case 5:
3467:             sum5 -= *v5*tmp0;
3468:           case 4: /* fall through */
3469:             sum4 -= *v4*tmp0;
3470:           case 3:
3471:             sum3 -= *v3*tmp0;
3472:           case 2:
3473:             sum2 -= *v2*tmp0;
3474:           case 1:
3475:             sum1 -= *v1*tmp0;
3476:           }
3477:         }
3478:         /* update */
3479:         if (xb == b) {
3480:           /* whole (old way) w/ diag */
3481:           switch (sizes[i]) {
3482:           case 5:
3483:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3484:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3485:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3486:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3487:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3488:             break;
3489:           case 4:
3490:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3491:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3492:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3493:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3494:             break;
3495:           case 3:
3496:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3497:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3498:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3499:             break;
3500:           case 2:
3501:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3502:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3503:             break;
3504:           case 1:
3505:             x[row--] += sum1*(*ibdiag);
3506:             break;
3507:           }
3508:         } else {
3509:           /* no diag so set =  */
3510:           switch (sizes[i]) {
3511:           case 5:
3512:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3513:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3514:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3515:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3516:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3517:             break;
3518:           case 4:
3519:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3520:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3521:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3522:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3523:             break;
3524:           case 3:
3525:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3526:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3527:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3528:             break;
3529:           case 2:
3530:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3531:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3532:             break;
3533:           case 1:
3534:             x[row--] = sum1*(*ibdiag);
3535:             break;
3536:           }
3537:         }
3538:       }
3539:       if (xb == b) {
3540:         PetscLogFlops(2.0*a->nz);
3541:       } else {
3542:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3543:       }
3544:     }
3545:   }
3546:   if (flag & SOR_EISENSTAT) {
3547:     /*
3548:           Apply  (U + D)^-1  where D is now the block diagonal
3549:     */
3550:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3551:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3552:       ibdiag -= sizes[i]*sizes[i];
3553:       sz      = ii[row+1] - diag[row] - 1;
3554:       v1      = a->a + diag[row] + 1;
3555:       idx     = a->j + diag[row] + 1;
3556:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3557:       switch (sizes[i]) {
3558:       case 1:

3560:         sum1 = b[row];
3561:         for (n = 0; n<sz-1; n+=2) {
3562:           i1    = idx[0];
3563:           i2    = idx[1];
3564:           idx  += 2;
3565:           tmp0  = x[i1];
3566:           tmp1  = x[i2];
3567:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3568:         }

3570:         if (n == sz-1) {
3571:           tmp0  = x[*idx];
3572:           sum1 -= *v1*tmp0;
3573:         }
3574:         x[row] = sum1*(*ibdiag);row--;
3575:         break;

3577:       case 2:

3579:         sum1 = b[row];
3580:         sum2 = b[row-1];
3581:         /* note that sum1 is associated with the second of the two rows */
3582:         v2 = a->a + diag[row-1] + 2;
3583:         for (n = 0; n<sz-1; n+=2) {
3584:           i1    = idx[0];
3585:           i2    = idx[1];
3586:           idx  += 2;
3587:           tmp0  = x[i1];
3588:           tmp1  = x[i2];
3589:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3590:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3591:         }

3593:         if (n == sz-1) {
3594:           tmp0  = x[*idx];
3595:           sum1 -= *v1*tmp0;
3596:           sum2 -= *v2*tmp0;
3597:         }
3598:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3599:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3600:         row     -= 2;
3601:         break;
3602:       case 3:

3604:         sum1 = b[row];
3605:         sum2 = b[row-1];
3606:         sum3 = b[row-2];
3607:         v2   = a->a + diag[row-1] + 2;
3608:         v3   = a->a + diag[row-2] + 3;
3609:         for (n = 0; n<sz-1; n+=2) {
3610:           i1    = idx[0];
3611:           i2    = idx[1];
3612:           idx  += 2;
3613:           tmp0  = x[i1];
3614:           tmp1  = x[i2];
3615:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3616:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3617:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3618:         }

3620:         if (n == sz-1) {
3621:           tmp0  = x[*idx];
3622:           sum1 -= *v1*tmp0;
3623:           sum2 -= *v2*tmp0;
3624:           sum3 -= *v3*tmp0;
3625:         }
3626:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3627:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3628:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3629:         row     -= 3;
3630:         break;
3631:       case 4:

3633:         sum1 = b[row];
3634:         sum2 = b[row-1];
3635:         sum3 = b[row-2];
3636:         sum4 = b[row-3];
3637:         v2   = a->a + diag[row-1] + 2;
3638:         v3   = a->a + diag[row-2] + 3;
3639:         v4   = a->a + diag[row-3] + 4;
3640:         for (n = 0; n<sz-1; n+=2) {
3641:           i1    = idx[0];
3642:           i2    = idx[1];
3643:           idx  += 2;
3644:           tmp0  = x[i1];
3645:           tmp1  = x[i2];
3646:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3647:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3648:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3649:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3650:         }

3652:         if (n == sz-1) {
3653:           tmp0  = x[*idx];
3654:           sum1 -= *v1*tmp0;
3655:           sum2 -= *v2*tmp0;
3656:           sum3 -= *v3*tmp0;
3657:           sum4 -= *v4*tmp0;
3658:         }
3659:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3660:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3661:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3662:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3663:         row     -= 4;
3664:         break;
3665:       case 5:

3667:         sum1 = b[row];
3668:         sum2 = b[row-1];
3669:         sum3 = b[row-2];
3670:         sum4 = b[row-3];
3671:         sum5 = b[row-4];
3672:         v2   = a->a + diag[row-1] + 2;
3673:         v3   = a->a + diag[row-2] + 3;
3674:         v4   = a->a + diag[row-3] + 4;
3675:         v5   = a->a + diag[row-4] + 5;
3676:         for (n = 0; n<sz-1; n+=2) {
3677:           i1    = idx[0];
3678:           i2    = idx[1];
3679:           idx  += 2;
3680:           tmp0  = x[i1];
3681:           tmp1  = x[i2];
3682:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3683:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3684:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3685:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3686:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3687:         }

3689:         if (n == sz-1) {
3690:           tmp0  = x[*idx];
3691:           sum1 -= *v1*tmp0;
3692:           sum2 -= *v2*tmp0;
3693:           sum3 -= *v3*tmp0;
3694:           sum4 -= *v4*tmp0;
3695:           sum5 -= *v5*tmp0;
3696:         }
3697:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3698:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3699:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3700:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3701:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3702:         row     -= 5;
3703:         break;
3704:       default:
3705:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3706:       }
3707:     }
3708:     PetscLogFlops(a->nz);

3710:     /*
3711:            t = b - D x    where D is the block diagonal
3712:     */
3713:     cnt = 0;
3714:     for (i=0, row=0; i<m; i++) {
3715:       switch (sizes[i]) {
3716:       case 1:
3717:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3718:         break;
3719:       case 2:
3720:         x1       = x[row]; x2 = x[row+1];
3721:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3722:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3723:         t[row]   = b[row] - tmp1;
3724:         t[row+1] = b[row+1] - tmp2; row += 2;
3725:         cnt     += 4;
3726:         break;
3727:       case 3:
3728:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3729:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3730:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3731:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3732:         t[row]   = b[row] - tmp1;
3733:         t[row+1] = b[row+1] - tmp2;
3734:         t[row+2] = b[row+2] - tmp3; row += 3;
3735:         cnt     += 9;
3736:         break;
3737:       case 4:
3738:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3739:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3740:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3741:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3742:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3743:         t[row]   = b[row] - tmp1;
3744:         t[row+1] = b[row+1] - tmp2;
3745:         t[row+2] = b[row+2] - tmp3;
3746:         t[row+3] = b[row+3] - tmp4; row += 4;
3747:         cnt     += 16;
3748:         break;
3749:       case 5:
3750:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3751:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3752:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3753:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3754:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3755:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3756:         t[row]   = b[row] - tmp1;
3757:         t[row+1] = b[row+1] - tmp2;
3758:         t[row+2] = b[row+2] - tmp3;
3759:         t[row+3] = b[row+3] - tmp4;
3760:         t[row+4] = b[row+4] - tmp5;row += 5;
3761:         cnt     += 25;
3762:         break;
3763:       default:
3764:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3765:       }
3766:     }
3767:     PetscLogFlops(m);

3769:     /*
3770:           Apply (L + D)^-1 where D is the block diagonal
3771:     */
3772:     for (i=0, row=0; i<m; i++) {
3773:       sz  = diag[row] - ii[row];
3774:       v1  = a->a + ii[row];
3775:       idx = a->j + ii[row];
3776:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3777:       switch (sizes[i]) {
3778:       case 1:

3780:         sum1 = t[row];
3781:         for (n = 0; n<sz-1; n+=2) {
3782:           i1    = idx[0];
3783:           i2    = idx[1];
3784:           idx  += 2;
3785:           tmp0  = t[i1];
3786:           tmp1  = t[i2];
3787:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3788:         }

3790:         if (n == sz-1) {
3791:           tmp0  = t[*idx];
3792:           sum1 -= *v1 * tmp0;
3793:         }
3794:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3795:         break;
3796:       case 2:
3797:         v2   = a->a + ii[row+1];
3798:         sum1 = t[row];
3799:         sum2 = t[row+1];
3800:         for (n = 0; n<sz-1; n+=2) {
3801:           i1    = idx[0];
3802:           i2    = idx[1];
3803:           idx  += 2;
3804:           tmp0  = t[i1];
3805:           tmp1  = t[i2];
3806:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3807:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3808:         }

3810:         if (n == sz-1) {
3811:           tmp0  = t[*idx];
3812:           sum1 -= v1[0] * tmp0;
3813:           sum2 -= v2[0] * tmp0;
3814:         }
3815:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3816:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3817:         ibdiag   += 4; row += 2;
3818:         break;
3819:       case 3:
3820:         v2   = a->a + ii[row+1];
3821:         v3   = a->a + ii[row+2];
3822:         sum1 = t[row];
3823:         sum2 = t[row+1];
3824:         sum3 = t[row+2];
3825:         for (n = 0; n<sz-1; n+=2) {
3826:           i1    = idx[0];
3827:           i2    = idx[1];
3828:           idx  += 2;
3829:           tmp0  = t[i1];
3830:           tmp1  = t[i2];
3831:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3832:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3833:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3834:         }

3836:         if (n == sz-1) {
3837:           tmp0  = t[*idx];
3838:           sum1 -= v1[0] * tmp0;
3839:           sum2 -= v2[0] * tmp0;
3840:           sum3 -= v3[0] * tmp0;
3841:         }
3842:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3843:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3844:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3845:         ibdiag   += 9; row += 3;
3846:         break;
3847:       case 4:
3848:         v2   = a->a + ii[row+1];
3849:         v3   = a->a + ii[row+2];
3850:         v4   = a->a + ii[row+3];
3851:         sum1 = t[row];
3852:         sum2 = t[row+1];
3853:         sum3 = t[row+2];
3854:         sum4 = t[row+3];
3855:         for (n = 0; n<sz-1; n+=2) {
3856:           i1    = idx[0];
3857:           i2    = idx[1];
3858:           idx  += 2;
3859:           tmp0  = t[i1];
3860:           tmp1  = t[i2];
3861:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3862:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3863:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3864:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3865:         }

3867:         if (n == sz-1) {
3868:           tmp0  = t[*idx];
3869:           sum1 -= v1[0] * tmp0;
3870:           sum2 -= v2[0] * tmp0;
3871:           sum3 -= v3[0] * tmp0;
3872:           sum4 -= v4[0] * tmp0;
3873:         }
3874:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3875:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3876:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3877:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3878:         ibdiag   += 16; row += 4;
3879:         break;
3880:       case 5:
3881:         v2   = a->a + ii[row+1];
3882:         v3   = a->a + ii[row+2];
3883:         v4   = a->a + ii[row+3];
3884:         v5   = a->a + ii[row+4];
3885:         sum1 = t[row];
3886:         sum2 = t[row+1];
3887:         sum3 = t[row+2];
3888:         sum4 = t[row+3];
3889:         sum5 = t[row+4];
3890:         for (n = 0; n<sz-1; n+=2) {
3891:           i1    = idx[0];
3892:           i2    = idx[1];
3893:           idx  += 2;
3894:           tmp0  = t[i1];
3895:           tmp1  = t[i2];
3896:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3897:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3898:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3899:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3900:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3901:         }

3903:         if (n == sz-1) {
3904:           tmp0  = t[*idx];
3905:           sum1 -= v1[0] * tmp0;
3906:           sum2 -= v2[0] * tmp0;
3907:           sum3 -= v3[0] * tmp0;
3908:           sum4 -= v4[0] * tmp0;
3909:           sum5 -= v5[0] * tmp0;
3910:         }
3911:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3912:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3913:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3914:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3915:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3916:         ibdiag   += 25; row += 5;
3917:         break;
3918:       default:
3919:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3920:       }
3921:     }
3922:     PetscLogFlops(a->nz);
3923:   }
3924:   VecRestoreArray(xx,&x);
3925:   VecRestoreArrayRead(bb,&b);
3926:   return 0;
3927: }

3929: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3930: {
3931:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3932:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3933:   const MatScalar   *bdiag = a->inode.bdiag;
3934:   const PetscScalar *b;
3935:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3936:   const PetscInt    *sizes = a->inode.size;

3939:   VecGetArray(xx,&x);
3940:   VecGetArrayRead(bb,&b);
3941:   cnt  = 0;
3942:   for (i=0, row=0; i<m; i++) {
3943:     switch (sizes[i]) {
3944:     case 1:
3945:       x[row] = b[row]*bdiag[cnt++];row++;
3946:       break;
3947:     case 2:
3948:       x1       = b[row]; x2 = b[row+1];
3949:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3950:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3951:       x[row++] = tmp1;
3952:       x[row++] = tmp2;
3953:       cnt     += 4;
3954:       break;
3955:     case 3:
3956:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3957:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3958:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3959:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3960:       x[row++] = tmp1;
3961:       x[row++] = tmp2;
3962:       x[row++] = tmp3;
3963:       cnt     += 9;
3964:       break;
3965:     case 4:
3966:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3967:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3968:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3969:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3970:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3971:       x[row++] = tmp1;
3972:       x[row++] = tmp2;
3973:       x[row++] = tmp3;
3974:       x[row++] = tmp4;
3975:       cnt     += 16;
3976:       break;
3977:     case 5:
3978:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3979:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3980:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3981:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3982:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3983:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3984:       x[row++] = tmp1;
3985:       x[row++] = tmp2;
3986:       x[row++] = tmp3;
3987:       x[row++] = tmp4;
3988:       x[row++] = tmp5;
3989:       cnt     += 25;
3990:       break;
3991:     default:
3992:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3993:     }
3994:   }
3995:   PetscLogFlops(2.0*cnt);
3996:   VecRestoreArray(xx,&x);
3997:   VecRestoreArrayRead(bb,&b);
3998:   return 0;
3999: }

4001: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4002: {
4003:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4005:   a->inode.node_count       = 0;
4006:   a->inode.use              = PETSC_FALSE;
4007:   a->inode.checked          = PETSC_FALSE;
4008:   a->inode.mat_nonzerostate = -1;
4009:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4010:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4011:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4012:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4013:   A->ops->coloringpatch     = NULL;
4014:   A->ops->multdiagonalblock = NULL;
4015:   if (A->factortype) {
4016:     A->ops->solve = MatSolve_SeqAIJ_inplace;
4017:   }
4018:   return 0;
4019: }

4021: /*
4022:     samestructure indicates that the matrix has not changed its nonzero structure so we
4023:     do not need to recompute the inodes
4024: */
4025: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4026: {
4027:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4028:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4029:   PetscBool      flag;
4030:   const PetscInt *idx,*idy,*ii;

4032:   if (!a->inode.use) {
4033:     MatSeqAIJ_Inode_ResetOps(A);
4034:     PetscFree(a->inode.size);
4035:     return 0;
4036:   }
4037:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return 0;

4039:   m = A->rmap->n;
4040:   if (!a->inode.size) PetscMalloc1(m+1,&a->inode.size);
4041:   ns = a->inode.size;

4043:   i          = 0;
4044:   node_count = 0;
4045:   idx        = a->j;
4046:   ii         = a->i;
4047:   while (i < m) {                /* For each row */
4048:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4049:     /* Limits the number of elements in a node to 'a->inode.limit' */
4050:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4051:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4052:       if (nzy != nzx) break;
4053:       idy += nzx;              /* Same nonzero pattern */
4054:       PetscArraycmp(idx,idy,nzx,&flag);
4055:       if (!flag) break;
4056:     }
4057:     ns[node_count++] = blk_size;
4058:     idx             += blk_size*nzx;
4059:     i                = j;
4060:   }

4062:   /* If not enough inodes found,, do not use inode version of the routines */
4063:   if (!m || node_count > .8*m) {
4064:     MatSeqAIJ_Inode_ResetOps(A);
4065:     PetscFree(a->inode.size);
4066:     PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m);
4067:   } else {
4068:     if (!A->factortype) {
4069:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4070:       if (A->rmap->n == A->cmap->n) {
4071:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4072:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4073:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4074:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4075:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4076:       }
4077:     } else {
4078:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4079:     }
4080:     a->inode.node_count = node_count;
4081:     PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit);
4082:   }
4083:   a->inode.checked          = PETSC_TRUE;
4084:   a->inode.mat_nonzerostate = A->nonzerostate;
4085:   return 0;
4086: }

4088: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4089: {
4090:   Mat            B =*C;
4091:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4092:   PetscInt       m=A->rmap->n;

4094:   c->inode.use              = a->inode.use;
4095:   c->inode.limit            = a->inode.limit;
4096:   c->inode.max_limit        = a->inode.max_limit;
4097:   c->inode.checked          = PETSC_FALSE;
4098:   c->inode.size             = NULL;
4099:   c->inode.node_count       = 0;
4100:   c->inode.ibdiagvalid      = PETSC_FALSE;
4101:   c->inode.ibdiag           = NULL;
4102:   c->inode.bdiag            = NULL;
4103:   c->inode.mat_nonzerostate = -1;
4104:   if (a->inode.use) {
4105:     if (a->inode.checked && a->inode.size) {
4106:       PetscMalloc1(m+1,&c->inode.size);
4107:       PetscArraycpy(c->inode.size,a->inode.size,m+1);

4109:       c->inode.checked          = PETSC_TRUE;
4110:       c->inode.node_count       = a->inode.node_count;
4111:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4112:     }
4113:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4114:     if (!B->factortype) {
4115:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4116:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4117:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4118:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4119:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4120:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4121:     } else {
4122:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4123:     }
4124:   }
4125:   return 0;
4126: }

4128: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4129: {
4130:   PetscInt       k;
4131:   const PetscInt *vi;

4133:   vi = aj + ai[row];
4134:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4135:   vi        = aj + adiag[row];
4136:   cols[nzl] = vi[0];
4137:   vi        = aj + adiag[row+1]+1;
4138:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4139:   return 0;
4140: }
4141: /*
4142:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4143:    Modified from MatSeqAIJCheckInode().

4145:    Input Parameters:
4146: .  Mat A - ILU or LU matrix factor

4148: */
4149: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4150: {
4151:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4152:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4153:   PetscInt       *cols1,*cols2,*ns;
4154:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4155:   PetscBool      flag;

4157:   if (!a->inode.use)    return 0;
4158:   if (a->inode.checked) return 0;

4160:   m = A->rmap->n;
4161:   if (a->inode.size) ns = a->inode.size;
4162:   else {
4163:     PetscMalloc1(m+1,&ns);
4164:   }

4166:   i          = 0;
4167:   node_count = 0;
4168:   PetscMalloc2(m,&cols1,m,&cols2);
4169:   while (i < m) {                /* For each row */
4170:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4171:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4172:     nzx  = nzl1 + nzu1 + 1;
4173:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4175:     /* Limits the number of elements in a node to 'a->inode.limit' */
4176:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4177:       nzl2 = ai[j+1] - ai[j];
4178:       nzu2 = adiag[j] - adiag[j+1] - 1;
4179:       nzy  = nzl2 + nzu2 + 1;
4180:       if (nzy != nzx) break;
4181:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4182:       PetscArraycmp(cols1,cols2,nzx,&flag);
4183:       if (!flag) break;
4184:     }
4185:     ns[node_count++] = blk_size;
4186:     i                = j;
4187:   }
4188:   PetscFree2(cols1,cols2);
4189:   /* If not enough inodes found,, do not use inode version of the routines */
4190:   if (!m || node_count > .8*m) {
4191:     PetscFree(ns);

4193:     a->inode.node_count = 0;
4194:     a->inode.size       = NULL;
4195:     a->inode.use        = PETSC_FALSE;

4197:     PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m);
4198:   } else {
4199:     A->ops->mult              = NULL;
4200:     A->ops->sor               = NULL;
4201:     A->ops->multadd           = NULL;
4202:     A->ops->getrowij          = NULL;
4203:     A->ops->restorerowij      = NULL;
4204:     A->ops->getcolumnij       = NULL;
4205:     A->ops->restorecolumnij   = NULL;
4206:     A->ops->coloringpatch     = NULL;
4207:     A->ops->multdiagonalblock = NULL;
4208:     a->inode.node_count       = node_count;
4209:     a->inode.size             = ns;

4211:     PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit);
4212:   }
4213:   a->inode.checked = PETSC_TRUE;
4214:   return 0;
4215: }

4217: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4218: {
4219:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4221:   a->inode.ibdiagvalid = PETSC_FALSE;
4222:   return 0;
4223: }

4225: /*
4226:      This is really ugly. if inodes are used this replaces the
4227:   permutations with ones that correspond to rows/cols of the matrix
4228:   rather then inode blocks
4229: */
4230: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4231: {
4232:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4233:   return 0;
4234: }

4236: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4237: {
4238:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4239:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4240:   const PetscInt *ridx,*cidx;
4241:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4242:   PetscInt       nslim_col,*ns_col;
4243:   IS             ris = *rperm,cis = *cperm;

4245:   if (!a->inode.size) return 0; /* no inodes so return */
4246:   if (a->inode.node_count == m) return 0; /* all inodes are of size 1 */

4248:   MatCreateColInode_Private(A,&nslim_col,&ns_col);
4249:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4250:   PetscMalloc2(m,&permr,n,&permc);

4252:   ISGetIndices(ris,&ridx);
4253:   ISGetIndices(cis,&cidx);

4255:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4256:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4258:   /* Construct the permutations for rows*/
4259:   for (i=0,row = 0; i<nslim_row; ++i) {
4260:     indx      = ridx[i];
4261:     start_val = tns[indx];
4262:     end_val   = tns[indx + 1];
4263:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4264:   }

4266:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4267:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4269:   /* Construct permutations for columns */
4270:   for (i=0,col=0; i<nslim_col; ++i) {
4271:     indx      = cidx[i];
4272:     start_val = tns[indx];
4273:     end_val   = tns[indx + 1];
4274:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4275:   }

4277:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4278:   ISSetPermutation(*rperm);
4279:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4280:   ISSetPermutation(*cperm);

4282:   ISRestoreIndices(ris,&ridx);
4283:   ISRestoreIndices(cis,&cidx);

4285:   PetscFree(ns_col);
4286:   PetscFree2(permr,permc);
4287:   ISDestroy(&cis);
4288:   ISDestroy(&ris);
4289:   PetscFree(tns);
4290:   return 0;
4291: }

4293: /*@C
4294:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4296:    Not Collective

4298:    Input Parameter:
4299: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4301:    Output Parameters:
4302: +  node_count - no of inodes present in the matrix.
4303: .  sizes      - an array of size node_count,with sizes of each inode.
4304: -  limit      - the max size used to generate the inodes.

4306:    Level: advanced

4308:    Notes:
4309:     This routine returns some internal storage information
4310:    of the matrix, it is intended to be used by advanced users.
4311:    It should be called after the matrix is assembled.
4312:    The contents of the sizes[] array should not be changed.
4313:    NULL may be passed for information not requested.

4315: .seealso: MatGetInfo()
4316: @*/
4317: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4318: {
4319:   PetscErrorCode (*f)(Mat,PetscInt*,PetscInt**,PetscInt*);

4322:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4323:   if (f) (*f)(A,node_count,sizes,limit);
4324:   return 0;
4325: }

4327: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4328: {
4329:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4331:   if (node_count) *node_count = a->inode.node_count;
4332:   if (sizes)      *sizes      = a->inode.size;
4333:   if (limit)      *limit      = a->inode.limit;
4334:   return 0;
4335: }