Actual source code: inode.c
petsc-3.4.5 2014-06-29
  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

 10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 14:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 17:   n      = A->cmap->n;
 18:   m      = A->rmap->n;
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 24:     for (; count+1 < n; count++,i++) ;
 25:     if (count < n)  {
 26:       i++;
 27:     }
 28:     *size = i;
 29:     return(0);
 30:   }
 31:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);

 33:   /* Use the same row structure wherever feasible. */
 34:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 35:     ns_col[i] = ns_row[i];
 36:   }

 38:   /* if m < n; pad up the remainder with inode_limit */
 39:   for (; count+1 < n; count++,i++) {
 40:     ns_col[i] = 1;
 41:   }
 42:   /* The last node is the odd ball. padd it up with the remaining rows; */
 43:   if (count < n) {
 44:     ns_col[i] = n - count;
 45:     i++;
 46:   } else if (count > n) {
 47:     /* Adjust for the over estimation */
 48:     ns_col[i-1] += n - count;
 49:   }
 50:   *size = i;
 51:   *ns   = ns_col;
 52:   return(0);
 53: }


 56: /*
 57:       This builds symmetric version of nonzero structure,
 58: */
 61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 62: {
 63:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 65:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 66:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 69:   nslim_row = a->inode.node_count;
 70:   m         = A->rmap->n;
 71:   n         = A->cmap->n;
 72:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");

 74:   /* Use the row_inode as column_inode */
 75:   nslim_col = nslim_row;
 76:   ns_col    = ns_row;

 78:   /* allocate space for reformated inode structure */
 79:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 80:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 81:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 83:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 84:     nsz = ns_col[i1];
 85:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 86:   }
 87:   /* allocate space for row pointers */
 88:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 89:   *iia = ia;
 90:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 91:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 93:   /* determine the number of columns in each row */
 94:   ia[0] = oshift;
 95:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 97:     j    = aj + ai[row] + ishift;
 98:     jmax = aj + ai[row+1] + ishift;
 99:     col  = *j++ + ishift;
100:     i2   = tvc[col];
101:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
102:       ia[i1+1]++;
103:       ia[i2+1]++;
104:       i2++;                     /* Start col of next node */
105:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
106:       i2 = tvc[col];
107:     }
108:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
109:   }

111:   /* shift ia[i] to point to next row */
112:   for (i1=1; i1<nslim_row+1; i1++) {
113:     row        = ia[i1-1];
114:     ia[i1]    += row;
115:     work[i1-1] = row - oshift;
116:   }

118:   /* allocate space for column pointers */
119:   nz   = ia[nslim_row] + (!ishift);
120:   PetscMalloc(nz*sizeof(PetscInt),&ja);
121:   *jja = ja;

123:   /* loop over lower triangular part putting into ja */
124:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
125:     j    = aj + ai[row] + ishift;
126:     jmax = aj + ai[row+1] + ishift;
127:     col  = *j++ + ishift;
128:     i2   = tvc[col];
129:     while (i2<i1 && j<jmax) {
130:       ja[work[i2]++] = i1 + oshift;
131:       ja[work[i1]++] = i2 + oshift;
132:       ++i2;
133:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
134:       i2 = tvc[col];
135:     }
136:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

138:   }
139:   PetscFree(work);
140:   PetscFree(tns);
141:   PetscFree(tvc);
142:   return(0);
143: }

145: /*
146:       This builds nonsymmetric version of nonzero structure,
147: */
150: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
151: {
152:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
155:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

158:   nslim_row = a->inode.node_count;
159:   n         = A->cmap->n;

161:   /* Create The column_inode for this matrix */
162:   Mat_CreateColInode(A,&nslim_col,&ns_col);

164:   /* allocate space for reformated column_inode structure */
165:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
166:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
167:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

169:   for (i1=0,col=0; i1<nslim_col; ++i1) {
170:     nsz = ns_col[i1];
171:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
172:   }
173:   /* allocate space for row pointers */
174:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
175:   *iia = ia;
176:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
177:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

179:   /* determine the number of columns in each row */
180:   ia[0] = oshift;
181:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
182:     j   = aj + ai[row] + ishift;
183:     col = *j++ + ishift;
184:     i2  = tvc[col];
185:     nz  = ai[row+1] - ai[row];
186:     while (nz-- > 0) {           /* off-diagonal elemets */
187:       ia[i1+1]++;
188:       i2++;                     /* Start col of next node */
189:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
190:       if (nz > 0) i2 = tvc[col];
191:     }
192:   }

194:   /* shift ia[i] to point to next row */
195:   for (i1=1; i1<nslim_row+1; i1++) {
196:     row        = ia[i1-1];
197:     ia[i1]    += row;
198:     work[i1-1] = row - oshift;
199:   }

201:   /* allocate space for column pointers */
202:   nz   = ia[nslim_row] + (!ishift);
203:   PetscMalloc(nz*sizeof(PetscInt),&ja);
204:   *jja = ja;

206:   /* loop over matrix putting into ja */
207:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
208:     j   = aj + ai[row] + ishift;
209:     col = *j++ + ishift;
210:     i2  = tvc[col];
211:     nz  = ai[row+1] - ai[row];
212:     while (nz-- > 0) {
213:       ja[work[i1]++] = i2 + oshift;
214:       ++i2;
215:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216:       if (nz > 0) i2 = tvc[col];
217:     }
218:   }
219:   PetscFree(ns_col);
220:   PetscFree(work);
221:   PetscFree(tns);
222:   PetscFree(tvc);
223:   return(0);
224: }

228: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
229: {
230:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

234:   *n = a->inode.node_count;
235:   if (!ia) return(0);
236:   if (!blockcompressed) {
237:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
238:   } else if (symmetric) {
239:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
240:   } else {
241:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
242:   }
243:   return(0);
244: }

248: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
249: {

253:   if (!ia) return(0);

255:   if (!blockcompressed) {
256:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
257:   } else {
258:     PetscFree(*ia);
259:     PetscFree(*ja);
260:   }
261:   return(0);
262: }

264: /* ----------------------------------------------------------- */

268: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
270:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
272:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

276:   nslim_row = a->inode.node_count;
277:   n         = A->cmap->n;

279:   /* Create The column_inode for this matrix */
280:   Mat_CreateColInode(A,&nslim_col,&ns_col);

282:   /* allocate space for reformated column_inode structure */
283:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

287:   for (i1=0,col=0; i1<nslim_col; ++i1) {
288:     nsz = ns_col[i1];
289:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
290:   }
291:   /* allocate space for column pointers */
292:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
293:   *iia = ia;
294:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
295:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

297:   /* determine the number of columns in each row */
298:   ia[0] = oshift;
299:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
300:     j   = aj + ai[row] + ishift;
301:     col = *j++ + ishift;
302:     i2  = tvc[col];
303:     nz  = ai[row+1] - ai[row];
304:     while (nz-- > 0) {           /* off-diagonal elemets */
305:       /* ia[i1+1]++; */
306:       ia[i2+1]++;
307:       i2++;
308:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
309:       if (nz > 0) i2 = tvc[col];
310:     }
311:   }

313:   /* shift ia[i] to point to next col */
314:   for (i1=1; i1<nslim_col+1; i1++) {
315:     col        = ia[i1-1];
316:     ia[i1]    += col;
317:     work[i1-1] = col - oshift;
318:   }

320:   /* allocate space for column pointers */
321:   nz   = ia[nslim_col] + (!ishift);
322:   PetscMalloc(nz*sizeof(PetscInt),&ja);
323:   *jja = ja;

325:   /* loop over matrix putting into ja */
326:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
327:     j   = aj + ai[row] + ishift;
328:     col = *j++ + ishift;
329:     i2  = tvc[col];
330:     nz  = ai[row+1] - ai[row];
331:     while (nz-- > 0) {
332:       /* ja[work[i1]++] = i2 + oshift; */
333:       ja[work[i2]++] = i1 + oshift;
334:       i2++;
335:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
336:       if (nz > 0) i2 = tvc[col];
337:     }
338:   }
339:   PetscFree(ns_col);
340:   PetscFree(work);
341:   PetscFree(tns);
342:   PetscFree(tvc);
343:   return(0);
344: }

348: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
349: {

353:   Mat_CreateColInode(A,n,NULL);
354:   if (!ia) return(0);

356:   if (!blockcompressed) {
357:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
358:   } else if (symmetric) {
359:     /* Since the indices are symmetric it does'nt matter */
360:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361:   } else {
362:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363:   }
364:   return(0);
365: }

369: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
370: {

374:   if (!ia) return(0);
375:   if (!blockcompressed) {
376:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
377:   } else {
378:     PetscFree(*ia);
379:     PetscFree(*ja);
380:   }
381:   return(0);
382: }

384: /* ----------------------------------------------------------- */

388: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
389: {
390:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
391:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
392:   PetscScalar       *y;
393:   const PetscScalar *x;
394:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
395:   PetscErrorCode    ierr;
396:   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;

398: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
399: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
400: #endif

403:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
404:   node_max = a->inode.node_count;
405:   ns       = a->inode.size;     /* Node Size array */
406:   VecGetArrayRead(xx,&x);
407:   VecGetArray(yy,&y);
408:   idx      = a->j;
409:   v1       = a->a;
410:   ii       = a->i;

412:   for (i = 0,row = 0; i< node_max; ++i) {
413:     nsz         = ns[i];
414:     n           = ii[1] - ii[0];
415:     nonzerorow += (n>0)*nsz;
416:     ii         += nsz;
417:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
418:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
419:     sz = n;                     /* No of non zeros in this row */
420:                                 /* Switch on the size of Node */
421:     switch (nsz) {               /* Each loop in 'case' is unrolled */
422:     case 1:
423:       sum1 = 0.;

425:       for (n = 0; n< sz-1; n+=2) {
426:         i1    = idx[0];         /* The instructions are ordered to */
427:         i2    = idx[1];         /* make the compiler's job easy */
428:         idx  += 2;
429:         tmp0  = x[i1];
430:         tmp1  = x[i2];
431:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
432:       }

434:       if (n == sz-1) {          /* Take care of the last nonzero  */
435:         tmp0  = x[*idx++];
436:         sum1 += *v1++ *tmp0;
437:       }
438:       y[row++]=sum1;
439:       break;
440:     case 2:
441:       sum1 = 0.;
442:       sum2 = 0.;
443:       v2   = v1 + n;

445:       for (n = 0; n< sz-1; n+=2) {
446:         i1    = idx[0];
447:         i2    = idx[1];
448:         idx  += 2;
449:         tmp0  = x[i1];
450:         tmp1  = x[i2];
451:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
452:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
453:       }
454:       if (n == sz-1) {
455:         tmp0  = x[*idx++];
456:         sum1 += *v1++ * tmp0;
457:         sum2 += *v2++ * tmp0;
458:       }
459:       y[row++]=sum1;
460:       y[row++]=sum2;
461:       v1      =v2;              /* Since the next block to be processed starts there*/
462:       idx    +=sz;
463:       break;
464:     case 3:
465:       sum1 = 0.;
466:       sum2 = 0.;
467:       sum3 = 0.;
468:       v2   = v1 + n;
469:       v3   = v2 + n;

471:       for (n = 0; n< sz-1; n+=2) {
472:         i1    = idx[0];
473:         i2    = idx[1];
474:         idx  += 2;
475:         tmp0  = x[i1];
476:         tmp1  = x[i2];
477:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
478:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
479:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
480:       }
481:       if (n == sz-1) {
482:         tmp0  = x[*idx++];
483:         sum1 += *v1++ * tmp0;
484:         sum2 += *v2++ * tmp0;
485:         sum3 += *v3++ * tmp0;
486:       }
487:       y[row++]=sum1;
488:       y[row++]=sum2;
489:       y[row++]=sum3;
490:       v1      =v3;              /* Since the next block to be processed starts there*/
491:       idx    +=2*sz;
492:       break;
493:     case 4:
494:       sum1 = 0.;
495:       sum2 = 0.;
496:       sum3 = 0.;
497:       sum4 = 0.;
498:       v2   = v1 + n;
499:       v3   = v2 + n;
500:       v4   = v3 + n;

502:       for (n = 0; n< sz-1; n+=2) {
503:         i1    = idx[0];
504:         i2    = idx[1];
505:         idx  += 2;
506:         tmp0  = x[i1];
507:         tmp1  = x[i2];
508:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
509:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
510:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
511:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
512:       }
513:       if (n == sz-1) {
514:         tmp0  = x[*idx++];
515:         sum1 += *v1++ * tmp0;
516:         sum2 += *v2++ * tmp0;
517:         sum3 += *v3++ * tmp0;
518:         sum4 += *v4++ * tmp0;
519:       }
520:       y[row++]=sum1;
521:       y[row++]=sum2;
522:       y[row++]=sum3;
523:       y[row++]=sum4;
524:       v1      =v4;              /* Since the next block to be processed starts there*/
525:       idx    +=3*sz;
526:       break;
527:     case 5:
528:       sum1 = 0.;
529:       sum2 = 0.;
530:       sum3 = 0.;
531:       sum4 = 0.;
532:       sum5 = 0.;
533:       v2   = v1 + n;
534:       v3   = v2 + n;
535:       v4   = v3 + n;
536:       v5   = v4 + n;

538:       for (n = 0; n<sz-1; n+=2) {
539:         i1    = idx[0];
540:         i2    = idx[1];
541:         idx  += 2;
542:         tmp0  = x[i1];
543:         tmp1  = x[i2];
544:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
545:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
546:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
547:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
548:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
549:       }
550:       if (n == sz-1) {
551:         tmp0  = x[*idx++];
552:         sum1 += *v1++ * tmp0;
553:         sum2 += *v2++ * tmp0;
554:         sum3 += *v3++ * tmp0;
555:         sum4 += *v4++ * tmp0;
556:         sum5 += *v5++ * tmp0;
557:       }
558:       y[row++]=sum1;
559:       y[row++]=sum2;
560:       y[row++]=sum3;
561:       y[row++]=sum4;
562:       y[row++]=sum5;
563:       v1      =v5;       /* Since the next block to be processed starts there */
564:       idx    +=4*sz;
565:       break;
566:     default:
567:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
568:     }
569:   }
570:   VecRestoreArrayRead(xx,&x);
571:   VecRestoreArray(yy,&y);
572:   PetscLogFlops(2.0*a->nz - nonzerorow);
573:   return(0);
574: }
575: /* ----------------------------------------------------------- */
576: /* Almost same code as the MatMult_SeqAIJ_Inode() */
579: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
580: {
581:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
582:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
583:   MatScalar      *v1,*v2,*v3,*v4,*v5;
584:   PetscScalar    *x,*y,*z,*zt;
586:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;

589:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
590:   node_max = a->inode.node_count;
591:   ns       = a->inode.size;     /* Node Size array */

593:   VecGetArray(xx,&x);
594:   VecGetArray(yy,&y);
595:   if (zz != yy) {
596:     VecGetArray(zz,&z);
597:   } else {
598:     z = y;
599:   }
600:   zt = z;

602:   idx = a->j;
603:   v1  = a->a;
604:   ii  = a->i;

606:   for (i = 0,row = 0; i< node_max; ++i) {
607:     nsz = ns[i];
608:     n   = ii[1] - ii[0];
609:     ii += nsz;
610:     sz  = n;                    /* No of non zeros in this row */
611:                                 /* Switch on the size of Node */
612:     switch (nsz) {               /* Each loop in 'case' is unrolled */
613:     case 1:
614:       sum1 = *zt++;

616:       for (n = 0; n< sz-1; n+=2) {
617:         i1    = idx[0];         /* The instructions are ordered to */
618:         i2    = idx[1];         /* make the compiler's job easy */
619:         idx  += 2;
620:         tmp0  = x[i1];
621:         tmp1  = x[i2];
622:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623:       }

625:       if (n   == sz-1) {          /* Take care of the last nonzero  */
626:         tmp0  = x[*idx++];
627:         sum1 += *v1++ * tmp0;
628:       }
629:       y[row++]=sum1;
630:       break;
631:     case 2:
632:       sum1 = *zt++;
633:       sum2 = *zt++;
634:       v2   = v1 + n;

636:       for (n = 0; n< sz-1; n+=2) {
637:         i1    = idx[0];
638:         i2    = idx[1];
639:         idx  += 2;
640:         tmp0  = x[i1];
641:         tmp1  = x[i2];
642:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644:       }
645:       if (n   == sz-1) {
646:         tmp0  = x[*idx++];
647:         sum1 += *v1++ * tmp0;
648:         sum2 += *v2++ * tmp0;
649:       }
650:       y[row++]=sum1;
651:       y[row++]=sum2;
652:       v1      =v2;              /* Since the next block to be processed starts there*/
653:       idx    +=sz;
654:       break;
655:     case 3:
656:       sum1 = *zt++;
657:       sum2 = *zt++;
658:       sum3 = *zt++;
659:       v2   = v1 + n;
660:       v3   = v2 + n;

662:       for (n = 0; n< sz-1; n+=2) {
663:         i1    = idx[0];
664:         i2    = idx[1];
665:         idx  += 2;
666:         tmp0  = x[i1];
667:         tmp1  = x[i2];
668:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671:       }
672:       if (n == sz-1) {
673:         tmp0  = x[*idx++];
674:         sum1 += *v1++ * tmp0;
675:         sum2 += *v2++ * tmp0;
676:         sum3 += *v3++ * tmp0;
677:       }
678:       y[row++]=sum1;
679:       y[row++]=sum2;
680:       y[row++]=sum3;
681:       v1      =v3;              /* Since the next block to be processed starts there*/
682:       idx    +=2*sz;
683:       break;
684:     case 4:
685:       sum1 = *zt++;
686:       sum2 = *zt++;
687:       sum3 = *zt++;
688:       sum4 = *zt++;
689:       v2   = v1 + n;
690:       v3   = v2 + n;
691:       v4   = v3 + n;

693:       for (n = 0; n< sz-1; n+=2) {
694:         i1    = idx[0];
695:         i2    = idx[1];
696:         idx  += 2;
697:         tmp0  = x[i1];
698:         tmp1  = x[i2];
699:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703:       }
704:       if (n == sz-1) {
705:         tmp0  = x[*idx++];
706:         sum1 += *v1++ * tmp0;
707:         sum2 += *v2++ * tmp0;
708:         sum3 += *v3++ * tmp0;
709:         sum4 += *v4++ * tmp0;
710:       }
711:       y[row++]=sum1;
712:       y[row++]=sum2;
713:       y[row++]=sum3;
714:       y[row++]=sum4;
715:       v1      =v4;              /* Since the next block to be processed starts there*/
716:       idx    +=3*sz;
717:       break;
718:     case 5:
719:       sum1 = *zt++;
720:       sum2 = *zt++;
721:       sum3 = *zt++;
722:       sum4 = *zt++;
723:       sum5 = *zt++;
724:       v2   = v1 + n;
725:       v3   = v2 + n;
726:       v4   = v3 + n;
727:       v5   = v4 + n;

729:       for (n = 0; n<sz-1; n+=2) {
730:         i1    = idx[0];
731:         i2    = idx[1];
732:         idx  += 2;
733:         tmp0  = x[i1];
734:         tmp1  = x[i2];
735:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740:       }
741:       if (n   == sz-1) {
742:         tmp0  = x[*idx++];
743:         sum1 += *v1++ * tmp0;
744:         sum2 += *v2++ * tmp0;
745:         sum3 += *v3++ * tmp0;
746:         sum4 += *v4++ * tmp0;
747:         sum5 += *v5++ * tmp0;
748:       }
749:       y[row++]=sum1;
750:       y[row++]=sum2;
751:       y[row++]=sum3;
752:       y[row++]=sum4;
753:       y[row++]=sum5;
754:       v1      =v5;       /* Since the next block to be processed starts there */
755:       idx    +=4*sz;
756:       break;
757:     default:
758:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759:     }
760:   }
761:   VecRestoreArray(xx,&x);
762:   VecRestoreArray(yy,&y);
763:   if (zz != yy) {
764:     VecRestoreArray(zz,&z);
765:   }
766:   PetscLogFlops(2.0*a->nz);
767:   return(0);
768: }

770: /* ----------------------------------------------------------- */
773: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
774: {
775:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
776:   IS                iscol = a->col,isrow = a->row;
777:   PetscErrorCode    ierr;
778:   const PetscInt    *r,*c,*rout,*cout;
779:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
780:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
781:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
782:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
783:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
784:   const PetscScalar *b;

787:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
788:   node_max = a->inode.node_count;
789:   ns       = a->inode.size;     /* Node Size array */

791:   VecGetArrayRead(bb,&b);
792:   VecGetArray(xx,&x);
793:   tmp  = a->solve_work;

795:   ISGetIndices(isrow,&rout); r = rout;
796:   ISGetIndices(iscol,&cout); c = cout + (n-1);

798:   /* forward solve the lower triangular */
799:   tmps = tmp;
800:   aa   = a_a;
801:   aj   = a_j;
802:   ad   = a->diag;

804:   for (i = 0,row = 0; i< node_max; ++i) {
805:     nsz = ns[i];
806:     aii = ai[row];
807:     v1  = aa + aii;
808:     vi  = aj + aii;
809:     nz  = ad[row]- aii;
810:     if (i < node_max-1) {
811:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
812:       * but our indexing to determine it's size could. */
813:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
814:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
815:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
816:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
817:     }

819:     switch (nsz) {               /* Each loop in 'case' is unrolled */
820:     case 1:
821:       sum1 = b[*r++];
822:       for (j=0; j<nz-1; j+=2) {
823:         i0    = vi[0];
824:         i1    = vi[1];
825:         vi   +=2;
826:         tmp0  = tmps[i0];
827:         tmp1  = tmps[i1];
828:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
829:       }
830:       if (j == nz-1) {
831:         tmp0  = tmps[*vi++];
832:         sum1 -= *v1++ *tmp0;
833:       }
834:       tmp[row++]=sum1;
835:       break;
836:     case 2:
837:       sum1 = b[*r++];
838:       sum2 = b[*r++];
839:       v2   = aa + ai[row+1];

841:       for (j=0; j<nz-1; j+=2) {
842:         i0    = vi[0];
843:         i1    = vi[1];
844:         vi   +=2;
845:         tmp0  = tmps[i0];
846:         tmp1  = tmps[i1];
847:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849:       }
850:       if (j == nz-1) {
851:         tmp0  = tmps[*vi++];
852:         sum1 -= *v1++ *tmp0;
853:         sum2 -= *v2++ *tmp0;
854:       }
855:       sum2     -= *v2++ *sum1;
856:       tmp[row++]=sum1;
857:       tmp[row++]=sum2;
858:       break;
859:     case 3:
860:       sum1 = b[*r++];
861:       sum2 = b[*r++];
862:       sum3 = b[*r++];
863:       v2   = aa + ai[row+1];
864:       v3   = aa + ai[row+2];

866:       for (j=0; j<nz-1; j+=2) {
867:         i0    = vi[0];
868:         i1    = vi[1];
869:         vi   +=2;
870:         tmp0  = tmps[i0];
871:         tmp1  = tmps[i1];
872:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
873:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
874:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
875:       }
876:       if (j == nz-1) {
877:         tmp0  = tmps[*vi++];
878:         sum1 -= *v1++ *tmp0;
879:         sum2 -= *v2++ *tmp0;
880:         sum3 -= *v3++ *tmp0;
881:       }
882:       sum2 -= *v2++ * sum1;
883:       sum3 -= *v3++ * sum1;
884:       sum3 -= *v3++ * sum2;

886:       tmp[row++]=sum1;
887:       tmp[row++]=sum2;
888:       tmp[row++]=sum3;
889:       break;

891:     case 4:
892:       sum1 = b[*r++];
893:       sum2 = b[*r++];
894:       sum3 = b[*r++];
895:       sum4 = b[*r++];
896:       v2   = aa + ai[row+1];
897:       v3   = aa + ai[row+2];
898:       v4   = aa + ai[row+3];

900:       for (j=0; j<nz-1; j+=2) {
901:         i0    = vi[0];
902:         i1    = vi[1];
903:         vi   +=2;
904:         tmp0  = tmps[i0];
905:         tmp1  = tmps[i1];
906:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
907:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
908:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
909:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
910:       }
911:       if (j == nz-1) {
912:         tmp0  = tmps[*vi++];
913:         sum1 -= *v1++ *tmp0;
914:         sum2 -= *v2++ *tmp0;
915:         sum3 -= *v3++ *tmp0;
916:         sum4 -= *v4++ *tmp0;
917:       }
918:       sum2 -= *v2++ * sum1;
919:       sum3 -= *v3++ * sum1;
920:       sum4 -= *v4++ * sum1;
921:       sum3 -= *v3++ * sum2;
922:       sum4 -= *v4++ * sum2;
923:       sum4 -= *v4++ * sum3;

925:       tmp[row++]=sum1;
926:       tmp[row++]=sum2;
927:       tmp[row++]=sum3;
928:       tmp[row++]=sum4;
929:       break;
930:     case 5:
931:       sum1 = b[*r++];
932:       sum2 = b[*r++];
933:       sum3 = b[*r++];
934:       sum4 = b[*r++];
935:       sum5 = b[*r++];
936:       v2   = aa + ai[row+1];
937:       v3   = aa + ai[row+2];
938:       v4   = aa + ai[row+3];
939:       v5   = aa + ai[row+4];

941:       for (j=0; j<nz-1; j+=2) {
942:         i0    = vi[0];
943:         i1    = vi[1];
944:         vi   +=2;
945:         tmp0  = tmps[i0];
946:         tmp1  = tmps[i1];
947:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
948:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
949:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
950:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
951:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
952:       }
953:       if (j == nz-1) {
954:         tmp0  = tmps[*vi++];
955:         sum1 -= *v1++ *tmp0;
956:         sum2 -= *v2++ *tmp0;
957:         sum3 -= *v3++ *tmp0;
958:         sum4 -= *v4++ *tmp0;
959:         sum5 -= *v5++ *tmp0;
960:       }

962:       sum2 -= *v2++ * sum1;
963:       sum3 -= *v3++ * sum1;
964:       sum4 -= *v4++ * sum1;
965:       sum5 -= *v5++ * sum1;
966:       sum3 -= *v3++ * sum2;
967:       sum4 -= *v4++ * sum2;
968:       sum5 -= *v5++ * sum2;
969:       sum4 -= *v4++ * sum3;
970:       sum5 -= *v5++ * sum3;
971:       sum5 -= *v5++ * sum4;

973:       tmp[row++]=sum1;
974:       tmp[row++]=sum2;
975:       tmp[row++]=sum3;
976:       tmp[row++]=sum4;
977:       tmp[row++]=sum5;
978:       break;
979:     default:
980:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
981:     }
982:   }
983:   /* backward solve the upper triangular */
984:   for (i=node_max -1,row = n-1; i>=0; i--) {
985:     nsz = ns[i];
986:     aii = ai[row+1] -1;
987:     v1  = aa + aii;
988:     vi  = aj + aii;
989:     nz  = aii- ad[row];
990:     switch (nsz) {               /* Each loop in 'case' is unrolled */
991:     case 1:
992:       sum1 = tmp[row];

994:       for (j=nz; j>1; j-=2) {
995:         vi   -=2;
996:         i0    = vi[2];
997:         i1    = vi[1];
998:         tmp0  = tmps[i0];
999:         tmp1  = tmps[i1];
1000:         v1   -= 2;
1001:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002:       }
1003:       if (j==1) {
1004:         tmp0  = tmps[*vi--];
1005:         sum1 -= *v1-- * tmp0;
1006:       }
1007:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1008:       break;
1009:     case 2:
1010:       sum1 = tmp[row];
1011:       sum2 = tmp[row -1];
1012:       v2   = aa + ai[row]-1;
1013:       for (j=nz; j>1; j-=2) {
1014:         vi   -=2;
1015:         i0    = vi[2];
1016:         i1    = vi[1];
1017:         tmp0  = tmps[i0];
1018:         tmp1  = tmps[i1];
1019:         v1   -= 2;
1020:         v2   -= 2;
1021:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023:       }
1024:       if (j==1) {
1025:         tmp0  = tmps[*vi--];
1026:         sum1 -= *v1-- * tmp0;
1027:         sum2 -= *v2-- * tmp0;
1028:       }

1030:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1031:       sum2   -= *v2-- * tmp0;
1032:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1033:       break;
1034:     case 3:
1035:       sum1 = tmp[row];
1036:       sum2 = tmp[row -1];
1037:       sum3 = tmp[row -2];
1038:       v2   = aa + ai[row]-1;
1039:       v3   = aa + ai[row -1]-1;
1040:       for (j=nz; j>1; j-=2) {
1041:         vi   -=2;
1042:         i0    = vi[2];
1043:         i1    = vi[1];
1044:         tmp0  = tmps[i0];
1045:         tmp1  = tmps[i1];
1046:         v1   -= 2;
1047:         v2   -= 2;
1048:         v3   -= 2;
1049:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1050:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1051:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1052:       }
1053:       if (j==1) {
1054:         tmp0  = tmps[*vi--];
1055:         sum1 -= *v1-- * tmp0;
1056:         sum2 -= *v2-- * tmp0;
1057:         sum3 -= *v3-- * tmp0;
1058:       }
1059:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060:       sum2   -= *v2-- * tmp0;
1061:       sum3   -= *v3-- * tmp0;
1062:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1063:       sum3   -= *v3-- * tmp0;
1064:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1066:       break;
1067:     case 4:
1068:       sum1 = tmp[row];
1069:       sum2 = tmp[row -1];
1070:       sum3 = tmp[row -2];
1071:       sum4 = tmp[row -3];
1072:       v2   = aa + ai[row]-1;
1073:       v3   = aa + ai[row -1]-1;
1074:       v4   = aa + ai[row -2]-1;

1076:       for (j=nz; j>1; j-=2) {
1077:         vi   -=2;
1078:         i0    = vi[2];
1079:         i1    = vi[1];
1080:         tmp0  = tmps[i0];
1081:         tmp1  = tmps[i1];
1082:         v1   -= 2;
1083:         v2   -= 2;
1084:         v3   -= 2;
1085:         v4   -= 2;
1086:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1087:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1088:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1089:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1090:       }
1091:       if (j==1) {
1092:         tmp0  = tmps[*vi--];
1093:         sum1 -= *v1-- * tmp0;
1094:         sum2 -= *v2-- * tmp0;
1095:         sum3 -= *v3-- * tmp0;
1096:         sum4 -= *v4-- * tmp0;
1097:       }

1099:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1100:       sum2   -= *v2-- * tmp0;
1101:       sum3   -= *v3-- * tmp0;
1102:       sum4   -= *v4-- * tmp0;
1103:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1104:       sum3   -= *v3-- * tmp0;
1105:       sum4   -= *v4-- * tmp0;
1106:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1107:       sum4   -= *v4-- * tmp0;
1108:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1109:       break;
1110:     case 5:
1111:       sum1 = tmp[row];
1112:       sum2 = tmp[row -1];
1113:       sum3 = tmp[row -2];
1114:       sum4 = tmp[row -3];
1115:       sum5 = tmp[row -4];
1116:       v2   = aa + ai[row]-1;
1117:       v3   = aa + ai[row -1]-1;
1118:       v4   = aa + ai[row -2]-1;
1119:       v5   = aa + ai[row -3]-1;
1120:       for (j=nz; j>1; j-=2) {
1121:         vi   -= 2;
1122:         i0    = vi[2];
1123:         i1    = vi[1];
1124:         tmp0  = tmps[i0];
1125:         tmp1  = tmps[i1];
1126:         v1   -= 2;
1127:         v2   -= 2;
1128:         v3   -= 2;
1129:         v4   -= 2;
1130:         v5   -= 2;
1131:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1132:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1133:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1134:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1135:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1136:       }
1137:       if (j==1) {
1138:         tmp0  = tmps[*vi--];
1139:         sum1 -= *v1-- * tmp0;
1140:         sum2 -= *v2-- * tmp0;
1141:         sum3 -= *v3-- * tmp0;
1142:         sum4 -= *v4-- * tmp0;
1143:         sum5 -= *v5-- * tmp0;
1144:       }

1146:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1147:       sum2   -= *v2-- * tmp0;
1148:       sum3   -= *v3-- * tmp0;
1149:       sum4   -= *v4-- * tmp0;
1150:       sum5   -= *v5-- * tmp0;
1151:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1152:       sum3   -= *v3-- * tmp0;
1153:       sum4   -= *v4-- * tmp0;
1154:       sum5   -= *v5-- * tmp0;
1155:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1156:       sum4   -= *v4-- * tmp0;
1157:       sum5   -= *v5-- * tmp0;
1158:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1159:       sum5   -= *v5-- * tmp0;
1160:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1161:       break;
1162:     default:
1163:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1164:     }
1165:   }
1166:   ISRestoreIndices(isrow,&rout);
1167:   ISRestoreIndices(iscol,&cout);
1168:   VecRestoreArrayRead(bb,&b);
1169:   VecRestoreArray(xx,&x);
1170:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1171:   return(0);
1172: }

1176: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1177: {
1178:   Mat             C     =B;
1179:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1180:   IS              isrow = b->row,isicol = b->icol;
1181:   PetscErrorCode  ierr;
1182:   const PetscInt  *r,*ic,*ics;
1183:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1184:   PetscInt        i,j,k,nz,nzL,row,*pj;
1185:   const PetscInt  *ajtmp,*bjtmp;
1186:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1187:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1188:   FactorShiftCtx  sctx;
1189:   const PetscInt  *ddiag;
1190:   PetscReal       rs;
1191:   MatScalar       d;
1192:   PetscInt        inod,nodesz,node_max,col;
1193:   const PetscInt  *ns;
1194:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1197:   /* MatPivotSetUp(): initialize shift context sctx */
1198:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1200:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1201:     ddiag          = a->diag;
1202:     sctx.shift_top = info->zeropivot;
1203:     for (i=0; i<n; i++) {
1204:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1205:       d  = (aa)[ddiag[i]];
1206:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1207:       v  = aa+ai[i];
1208:       nz = ai[i+1] - ai[i];
1209:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1210:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1211:     }
1212:     sctx.shift_top *= 1.1;
1213:     sctx.nshift_max = 5;
1214:     sctx.shift_lo   = 0.;
1215:     sctx.shift_hi   = 1.;
1216:   }

1218:   ISGetIndices(isrow,&r);
1219:   ISGetIndices(isicol,&ic);

1221:   PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);
1222:   PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));
1223:   rtmp2 = rtmp1 + n;
1224:   rtmp3 = rtmp2 + n;
1225:   rtmp4 = rtmp3 + n;
1226:   ics   = ic;

1228:   node_max = a->inode.node_count;
1229:   ns       = a->inode.size;
1230:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1232:   /* If max inode size > 4, split it into two inodes.*/
1233:   /* also map the inode sizes according to the ordering */
1234:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1235:   for (i=0,j=0; i<node_max; ++i,++j) {
1236:     if (ns[i] > 4) {
1237:       tmp_vec1[j] = 4;
1238:       ++j;
1239:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1240:     } else {
1241:       tmp_vec1[j] = ns[i];
1242:     }
1243:   }
1244:   /* Use the correct node_max */
1245:   node_max = j;

1247:   /* Now reorder the inode info based on mat re-ordering info */
1248:   /* First create a row -> inode_size_array_index map */
1249:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1250:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1251:   for (i=0,row=0; i<node_max; i++) {
1252:     nodesz = tmp_vec1[i];
1253:     for (j=0; j<nodesz; j++,row++) {
1254:       nsmap[row] = i;
1255:     }
1256:   }
1257:   /* Using nsmap, create a reordered ns structure */
1258:   for (i=0,j=0; i< node_max; i++) {
1259:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1260:     tmp_vec2[i] = nodesz;
1261:     j          += nodesz;
1262:   }
1263:   PetscFree(nsmap);
1264:   PetscFree(tmp_vec1);

1266:   /* Now use the correct ns */
1267:   ns = tmp_vec2;

1269:   do {
1270:     sctx.newshift = PETSC_FALSE;
1271:     /* Now loop over each block-row, and do the factorization */
1272:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1273:       nodesz = ns[inod];

1275:       switch (nodesz) {
1276:       case 1:
1277:         /*----------*/
1278:         /* zero rtmp1 */
1279:         /* L part */
1280:         nz    = bi[i+1] - bi[i];
1281:         bjtmp = bj + bi[i];
1282:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1284:         /* U part */
1285:         nz    = bdiag[i]-bdiag[i+1];
1286:         bjtmp = bj + bdiag[i+1]+1;
1287:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1289:         /* load in initial (unfactored row) */
1290:         nz    = ai[r[i]+1] - ai[r[i]];
1291:         ajtmp = aj + ai[r[i]];
1292:         v     = aa + ai[r[i]];
1293:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1295:         /* ZeropivotApply() */
1296:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1298:         /* elimination */
1299:         bjtmp = bj + bi[i];
1300:         row   = *bjtmp++;
1301:         nzL   = bi[i+1] - bi[i];
1302:         for (k=0; k < nzL; k++) {
1303:           pc = rtmp1 + row;
1304:           if (*pc != 0.0) {
1305:             pv   = b->a + bdiag[row];
1306:             mul1 = *pc * (*pv);
1307:             *pc  = mul1;
1308:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1309:             pv   = b->a + bdiag[row+1]+1;
1310:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1311:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1312:             PetscLogFlops(1+2*nz);
1313:           }
1314:           row = *bjtmp++;
1315:         }

1317:         /* finished row so stick it into b->a */
1318:         rs = 0.0;
1319:         /* L part */
1320:         pv = b->a + bi[i];
1321:         pj = b->j + bi[i];
1322:         nz = bi[i+1] - bi[i];
1323:         for (j=0; j<nz; j++) {
1324:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1325:         }

1327:         /* U part */
1328:         pv = b->a + bdiag[i+1]+1;
1329:         pj = b->j + bdiag[i+1]+1;
1330:         nz = bdiag[i] - bdiag[i+1]-1;
1331:         for (j=0; j<nz; j++) {
1332:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333:         }

1335:         /* Check zero pivot */
1336:         sctx.rs = rs;
1337:         sctx.pv = rtmp1[i];
1338:         MatPivotCheck(A,info,&sctx,i);
1339:         if (sctx.newshift) break;

1341:         /* Mark diagonal and invert diagonal for simplier triangular solves */
1342:         pv  = b->a + bdiag[i];
1343:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1344:         break;

1346:       case 2:
1347:         /*----------*/
1348:         /* zero rtmp1 and rtmp2 */
1349:         /* L part */
1350:         nz    = bi[i+1] - bi[i];
1351:         bjtmp = bj + bi[i];
1352:         for  (j=0; j<nz; j++) {
1353:           col        = bjtmp[j];
1354:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1355:         }

1357:         /* U part */
1358:         nz    = bdiag[i]-bdiag[i+1];
1359:         bjtmp = bj + bdiag[i+1]+1;
1360:         for  (j=0; j<nz; j++) {
1361:           col        = bjtmp[j];
1362:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363:         }

1365:         /* load in initial (unfactored row) */
1366:         nz    = ai[r[i]+1] - ai[r[i]];
1367:         ajtmp = aj + ai[r[i]];
1368:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1369:         for (j=0; j<nz; j++) {
1370:           col        = ics[ajtmp[j]];
1371:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1372:         }
1373:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1374:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1376:         /* elimination */
1377:         bjtmp = bj + bi[i];
1378:         row   = *bjtmp++; /* pivot row */
1379:         nzL   = bi[i+1] - bi[i];
1380:         for (k=0; k < nzL; k++) {
1381:           pc1 = rtmp1 + row;
1382:           pc2 = rtmp2 + row;
1383:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1384:             pv   = b->a + bdiag[row];
1385:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1386:             *pc1 = mul1;       *pc2 = mul2;

1388:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1389:             pv = b->a + bdiag[row+1]+1;
1390:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1391:             for (j=0; j<nz; j++) {
1392:               col         = pj[j];
1393:               rtmp1[col] -= mul1 * pv[j];
1394:               rtmp2[col] -= mul2 * pv[j];
1395:             }
1396:             PetscLogFlops(2+4*nz);
1397:           }
1398:           row = *bjtmp++;
1399:         }

1401:         /* finished row i; check zero pivot, then stick row i into b->a */
1402:         rs = 0.0;
1403:         /* L part */
1404:         pc1 = b->a + bi[i];
1405:         pj  = b->j + bi[i];
1406:         nz  = bi[i+1] - bi[i];
1407:         for (j=0; j<nz; j++) {
1408:           col    = pj[j];
1409:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1410:         }
1411:         /* U part */
1412:         pc1 = b->a + bdiag[i+1]+1;
1413:         pj  = b->j + bdiag[i+1]+1;
1414:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1415:         for (j=0; j<nz; j++) {
1416:           col    = pj[j];
1417:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418:         }

1420:         sctx.rs = rs;
1421:         sctx.pv = rtmp1[i];
1422:         MatPivotCheck(A,info,&sctx,i);
1423:         if (sctx.newshift) break;
1424:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1425:         *pc1 = 1.0/sctx.pv;

1427:         /* Now take care of diagonal 2x2 block. */
1428:         pc2 = rtmp2 + i;
1429:         if (*pc2 != 0.0) {
1430:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1431:           *pc2 = mul1;          /* insert L entry */
1432:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1433:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1434:           for (j=0; j<nz; j++) {
1435:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1436:           }
1437:           PetscLogFlops(1+2*nz);
1438:         }

1440:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1441:         rs = 0.0;
1442:         /* L part */
1443:         pc2 = b->a + bi[i+1];
1444:         pj  = b->j + bi[i+1];
1445:         nz  = bi[i+2] - bi[i+1];
1446:         for (j=0; j<nz; j++) {
1447:           col    = pj[j];
1448:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1449:         }
1450:         /* U part */
1451:         pc2 = b->a + bdiag[i+2]+1;
1452:         pj  = b->j + bdiag[i+2]+1;
1453:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1454:         for (j=0; j<nz; j++) {
1455:           col    = pj[j];
1456:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457:         }

1459:         sctx.rs = rs;
1460:         sctx.pv = rtmp2[i+1];
1461:         MatPivotCheck(A,info,&sctx,i+1);
1462:         if (sctx.newshift) break;
1463:         pc2  = b->a + bdiag[i+1];
1464:         *pc2 = 1.0/sctx.pv;
1465:         break;

1467:       case 3:
1468:         /*----------*/
1469:         /* zero rtmp */
1470:         /* L part */
1471:         nz    = bi[i+1] - bi[i];
1472:         bjtmp = bj + bi[i];
1473:         for  (j=0; j<nz; j++) {
1474:           col        = bjtmp[j];
1475:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1476:         }

1478:         /* U part */
1479:         nz    = bdiag[i]-bdiag[i+1];
1480:         bjtmp = bj + bdiag[i+1]+1;
1481:         for  (j=0; j<nz; j++) {
1482:           col        = bjtmp[j];
1483:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484:         }

1486:         /* load in initial (unfactored row) */
1487:         nz    = ai[r[i]+1] - ai[r[i]];
1488:         ajtmp = aj + ai[r[i]];
1489:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1490:         for (j=0; j<nz; j++) {
1491:           col        = ics[ajtmp[j]];
1492:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1493:         }
1494:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1495:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1497:         /* elimination */
1498:         bjtmp = bj + bi[i];
1499:         row   = *bjtmp++; /* pivot row */
1500:         nzL   = bi[i+1] - bi[i];
1501:         for (k=0; k < nzL; k++) {
1502:           pc1 = rtmp1 + row;
1503:           pc2 = rtmp2 + row;
1504:           pc3 = rtmp3 + row;
1505:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1506:             pv   = b->a + bdiag[row];
1507:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1508:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1510:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1511:             pv = b->a + bdiag[row+1]+1;
1512:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1513:             for (j=0; j<nz; j++) {
1514:               col         = pj[j];
1515:               rtmp1[col] -= mul1 * pv[j];
1516:               rtmp2[col] -= mul2 * pv[j];
1517:               rtmp3[col] -= mul3 * pv[j];
1518:             }
1519:             PetscLogFlops(3+6*nz);
1520:           }
1521:           row = *bjtmp++;
1522:         }

1524:         /* finished row i; check zero pivot, then stick row i into b->a */
1525:         rs = 0.0;
1526:         /* L part */
1527:         pc1 = b->a + bi[i];
1528:         pj  = b->j + bi[i];
1529:         nz  = bi[i+1] - bi[i];
1530:         for (j=0; j<nz; j++) {
1531:           col    = pj[j];
1532:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1533:         }
1534:         /* U part */
1535:         pc1 = b->a + bdiag[i+1]+1;
1536:         pj  = b->j + bdiag[i+1]+1;
1537:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1538:         for (j=0; j<nz; j++) {
1539:           col    = pj[j];
1540:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541:         }

1543:         sctx.rs = rs;
1544:         sctx.pv = rtmp1[i];
1545:         MatPivotCheck(A,info,&sctx,i);
1546:         if (sctx.newshift) break;
1547:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1548:         *pc1 = 1.0/sctx.pv;

1550:         /* Now take care of 1st column of diagonal 3x3 block. */
1551:         pc2 = rtmp2 + i;
1552:         pc3 = rtmp3 + i;
1553:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1554:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1555:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1556:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1557:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1558:           for (j=0; j<nz; j++) {
1559:             col         = pj[j];
1560:             rtmp2[col] -= mul2 * rtmp1[col];
1561:             rtmp3[col] -= mul3 * rtmp1[col];
1562:           }
1563:           PetscLogFlops(2+4*nz);
1564:         }

1566:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1567:         rs = 0.0;
1568:         /* L part */
1569:         pc2 = b->a + bi[i+1];
1570:         pj  = b->j + bi[i+1];
1571:         nz  = bi[i+2] - bi[i+1];
1572:         for (j=0; j<nz; j++) {
1573:           col    = pj[j];
1574:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1575:         }
1576:         /* U part */
1577:         pc2 = b->a + bdiag[i+2]+1;
1578:         pj  = b->j + bdiag[i+2]+1;
1579:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1580:         for (j=0; j<nz; j++) {
1581:           col    = pj[j];
1582:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583:         }

1585:         sctx.rs = rs;
1586:         sctx.pv = rtmp2[i+1];
1587:         MatPivotCheck(A,info,&sctx,i+1);
1588:         if (sctx.newshift) break;
1589:         pc2  = b->a + bdiag[i+1];
1590:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1592:         /* Now take care of 2nd column of diagonal 3x3 block. */
1593:         pc3 = rtmp3 + i+1;
1594:         if (*pc3 != 0.0) {
1595:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1596:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1597:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1598:           for (j=0; j<nz; j++) {
1599:             col         = pj[j];
1600:             rtmp3[col] -= mul3 * rtmp2[col];
1601:           }
1602:           PetscLogFlops(1+2*nz);
1603:         }

1605:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1606:         rs = 0.0;
1607:         /* L part */
1608:         pc3 = b->a + bi[i+2];
1609:         pj  = b->j + bi[i+2];
1610:         nz  = bi[i+3] - bi[i+2];
1611:         for (j=0; j<nz; j++) {
1612:           col    = pj[j];
1613:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1614:         }
1615:         /* U part */
1616:         pc3 = b->a + bdiag[i+3]+1;
1617:         pj  = b->j + bdiag[i+3]+1;
1618:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1619:         for (j=0; j<nz; j++) {
1620:           col    = pj[j];
1621:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622:         }

1624:         sctx.rs = rs;
1625:         sctx.pv = rtmp3[i+2];
1626:         MatPivotCheck(A,info,&sctx,i+2);
1627:         if (sctx.newshift) break;
1628:         pc3  = b->a + bdiag[i+2];
1629:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1630:         break;
1631:       case 4:
1632:         /*----------*/
1633:         /* zero rtmp */
1634:         /* L part */
1635:         nz    = bi[i+1] - bi[i];
1636:         bjtmp = bj + bi[i];
1637:         for  (j=0; j<nz; j++) {
1638:           col        = bjtmp[j];
1639:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1640:         }

1642:         /* U part */
1643:         nz    = bdiag[i]-bdiag[i+1];
1644:         bjtmp = bj + bdiag[i+1]+1;
1645:         for  (j=0; j<nz; j++) {
1646:           col        = bjtmp[j];
1647:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1648:         }

1650:         /* load in initial (unfactored row) */
1651:         nz    = ai[r[i]+1] - ai[r[i]];
1652:         ajtmp = aj + ai[r[i]];
1653:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1654:         for (j=0; j<nz; j++) {
1655:           col        = ics[ajtmp[j]];
1656:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1657:         }
1658:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1659:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1661:         /* elimination */
1662:         bjtmp = bj + bi[i];
1663:         row   = *bjtmp++; /* pivot row */
1664:         nzL   = bi[i+1] - bi[i];
1665:         for (k=0; k < nzL; k++) {
1666:           pc1 = rtmp1 + row;
1667:           pc2 = rtmp2 + row;
1668:           pc3 = rtmp3 + row;
1669:           pc4 = rtmp4 + row;
1670:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1671:             pv   = b->a + bdiag[row];
1672:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1673:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1675:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1676:             pv = b->a + bdiag[row+1]+1;
1677:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1678:             for (j=0; j<nz; j++) {
1679:               col         = pj[j];
1680:               rtmp1[col] -= mul1 * pv[j];
1681:               rtmp2[col] -= mul2 * pv[j];
1682:               rtmp3[col] -= mul3 * pv[j];
1683:               rtmp4[col] -= mul4 * pv[j];
1684:             }
1685:             PetscLogFlops(4+8*nz);
1686:           }
1687:           row = *bjtmp++;
1688:         }

1690:         /* finished row i; check zero pivot, then stick row i into b->a */
1691:         rs = 0.0;
1692:         /* L part */
1693:         pc1 = b->a + bi[i];
1694:         pj  = b->j + bi[i];
1695:         nz  = bi[i+1] - bi[i];
1696:         for (j=0; j<nz; j++) {
1697:           col    = pj[j];
1698:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1699:         }
1700:         /* U part */
1701:         pc1 = b->a + bdiag[i+1]+1;
1702:         pj  = b->j + bdiag[i+1]+1;
1703:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1704:         for (j=0; j<nz; j++) {
1705:           col    = pj[j];
1706:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707:         }

1709:         sctx.rs = rs;
1710:         sctx.pv = rtmp1[i];
1711:         MatPivotCheck(A,info,&sctx,i);
1712:         if (sctx.newshift) break;
1713:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1714:         *pc1 = 1.0/sctx.pv;

1716:         /* Now take care of 1st column of diagonal 4x4 block. */
1717:         pc2 = rtmp2 + i;
1718:         pc3 = rtmp3 + i;
1719:         pc4 = rtmp4 + i;
1720:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1722:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1723:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1724:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1725:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1726:           for (j=0; j<nz; j++) {
1727:             col         = pj[j];
1728:             rtmp2[col] -= mul2 * rtmp1[col];
1729:             rtmp3[col] -= mul3 * rtmp1[col];
1730:             rtmp4[col] -= mul4 * rtmp1[col];
1731:           }
1732:           PetscLogFlops(3+6*nz);
1733:         }

1735:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1736:         rs = 0.0;
1737:         /* L part */
1738:         pc2 = b->a + bi[i+1];
1739:         pj  = b->j + bi[i+1];
1740:         nz  = bi[i+2] - bi[i+1];
1741:         for (j=0; j<nz; j++) {
1742:           col    = pj[j];
1743:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1744:         }
1745:         /* U part */
1746:         pc2 = b->a + bdiag[i+2]+1;
1747:         pj  = b->j + bdiag[i+2]+1;
1748:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1749:         for (j=0; j<nz; j++) {
1750:           col    = pj[j];
1751:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752:         }

1754:         sctx.rs = rs;
1755:         sctx.pv = rtmp2[i+1];
1756:         MatPivotCheck(A,info,&sctx,i+1);
1757:         if (sctx.newshift) break;
1758:         pc2  = b->a + bdiag[i+1];
1759:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1761:         /* Now take care of 2nd column of diagonal 4x4 block. */
1762:         pc3 = rtmp3 + i+1;
1763:         pc4 = rtmp4 + i+1;
1764:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1765:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1766:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1767:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1768:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1769:           for (j=0; j<nz; j++) {
1770:             col         = pj[j];
1771:             rtmp3[col] -= mul3 * rtmp2[col];
1772:             rtmp4[col] -= mul4 * rtmp2[col];
1773:           }
1774:           PetscLogFlops(4*nz);
1775:         }

1777:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1778:         rs = 0.0;
1779:         /* L part */
1780:         pc3 = b->a + bi[i+2];
1781:         pj  = b->j + bi[i+2];
1782:         nz  = bi[i+3] - bi[i+2];
1783:         for (j=0; j<nz; j++) {
1784:           col    = pj[j];
1785:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1786:         }
1787:         /* U part */
1788:         pc3 = b->a + bdiag[i+3]+1;
1789:         pj  = b->j + bdiag[i+3]+1;
1790:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1791:         for (j=0; j<nz; j++) {
1792:           col    = pj[j];
1793:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794:         }

1796:         sctx.rs = rs;
1797:         sctx.pv = rtmp3[i+2];
1798:         MatPivotCheck(A,info,&sctx,i+2);
1799:         if (sctx.newshift) break;
1800:         pc3  = b->a + bdiag[i+2];
1801:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1803:         /* Now take care of 3rd column of diagonal 4x4 block. */
1804:         pc4 = rtmp4 + i+2;
1805:         if (*pc4 != 0.0) {
1806:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1807:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1808:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1809:           for (j=0; j<nz; j++) {
1810:             col         = pj[j];
1811:             rtmp4[col] -= mul4 * rtmp3[col];
1812:           }
1813:           PetscLogFlops(1+2*nz);
1814:         }

1816:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1817:         rs = 0.0;
1818:         /* L part */
1819:         pc4 = b->a + bi[i+3];
1820:         pj  = b->j + bi[i+3];
1821:         nz  = bi[i+4] - bi[i+3];
1822:         for (j=0; j<nz; j++) {
1823:           col    = pj[j];
1824:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1825:         }
1826:         /* U part */
1827:         pc4 = b->a + bdiag[i+4]+1;
1828:         pj  = b->j + bdiag[i+4]+1;
1829:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1830:         for (j=0; j<nz; j++) {
1831:           col    = pj[j];
1832:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833:         }

1835:         sctx.rs = rs;
1836:         sctx.pv = rtmp4[i+3];
1837:         MatPivotCheck(A,info,&sctx,i+3);
1838:         if (sctx.newshift) break;
1839:         pc4  = b->a + bdiag[i+3];
1840:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1841:         break;

1843:       default:
1844:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1845:       }
1846:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1847:       i += nodesz;                 /* Update the row */
1848:     }

1850:     /* MatPivotRefine() */
1851:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1852:       /*
1853:        * if no shift in this attempt & shifting & started shifting & can refine,
1854:        * then try lower shift
1855:        */
1856:       sctx.shift_hi       = sctx.shift_fraction;
1857:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1859:       sctx.newshift       = PETSC_TRUE;
1860:       sctx.nshift++;
1861:     }
1862:   } while (sctx.newshift);

1864:   PetscFree(rtmp1);
1865:   PetscFree(tmp_vec2);
1866:   ISRestoreIndices(isicol,&ic);
1867:   ISRestoreIndices(isrow,&r);

1869:   C->ops->solve             = MatSolve_SeqAIJ;
1870:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1871:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1872:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1873:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1874:   C->assembled              = PETSC_TRUE;
1875:   C->preallocated           = PETSC_TRUE;

1877:   PetscLogFlops(C->cmap->n);

1879:   /* MatShiftView(A,info,&sctx) */
1880:   if (sctx.nshift) {
1881:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1882:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1883:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1884:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1885:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1886:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1887:     }
1888:   }
1889:   Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1890:   return(0);
1891: }

1895: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1896: {
1897:   Mat             C     = B;
1898:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1899:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1900:   PetscErrorCode  ierr;
1901:   const PetscInt  *r,*ic,*c,*ics;
1902:   PetscInt        n   = A->rmap->n,*bi = b->i;
1903:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1904:   PetscInt        i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1905:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1906:   PetscScalar     mul1,mul2,mul3,tmp;
1907:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1908:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1909:   PetscReal       rs=0.0;
1910:   FactorShiftCtx  sctx;

1913:   sctx.shift_top      = 0;
1914:   sctx.nshift_max     = 0;
1915:   sctx.shift_lo       = 0;
1916:   sctx.shift_hi       = 0;
1917:   sctx.shift_fraction = 0;

1919:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1920:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921:     sctx.shift_top = 0;
1922:     for (i=0; i<n; i++) {
1923:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924:       rs    = 0.0;
1925:       ajtmp = aj + ai[i];
1926:       rtmp1 = aa + ai[i];
1927:       nz    = ai[i+1] - ai[i];
1928:       for (j=0; j<nz; j++) {
1929:         if (*ajtmp != i) {
1930:           rs += PetscAbsScalar(*rtmp1++);
1931:         } else {
1932:           rs -= PetscRealPart(*rtmp1++);
1933:         }
1934:         ajtmp++;
1935:       }
1936:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1937:     }
1938:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939:     sctx.shift_top *= 1.1;
1940:     sctx.nshift_max = 5;
1941:     sctx.shift_lo   = 0.;
1942:     sctx.shift_hi   = 1.;
1943:   }
1944:   sctx.shift_amount = 0;
1945:   sctx.nshift       = 0;

1947:   ISGetIndices(isrow,&r);
1948:   ISGetIndices(iscol,&c);
1949:   ISGetIndices(isicol,&ic);
1950:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1951:   PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1952:   ics    = ic;
1953:   rtmp22 = rtmp11 + n;
1954:   rtmp33 = rtmp22 + n;

1956:   node_max = a->inode.node_count;
1957:   ns       = a->inode.size;
1958:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1960:   /* If max inode size > 3, split it into two inodes.*/
1961:   /* also map the inode sizes according to the ordering */
1962:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1963:   for (i=0,j=0; i<node_max; ++i,++j) {
1964:     if (ns[i]>3) {
1965:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1966:       ++j;
1967:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1968:     } else {
1969:       tmp_vec1[j] = ns[i];
1970:     }
1971:   }
1972:   /* Use the correct node_max */
1973:   node_max = j;

1975:   /* Now reorder the inode info based on mat re-ordering info */
1976:   /* First create a row -> inode_size_array_index map */
1977:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1978:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1979:   for (i=0,row=0; i<node_max; i++) {
1980:     nodesz = tmp_vec1[i];
1981:     for (j=0; j<nodesz; j++,row++) {
1982:       nsmap[row] = i;
1983:     }
1984:   }
1985:   /* Using nsmap, create a reordered ns structure */
1986:   for (i=0,j=0; i< node_max; i++) {
1987:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1988:     tmp_vec2[i] = nodesz;
1989:     j          += nodesz;
1990:   }
1991:   PetscFree(nsmap);
1992:   PetscFree(tmp_vec1);
1993:   /* Now use the correct ns */
1994:   ns = tmp_vec2;

1996:   do {
1997:     sctx.newshift = PETSC_FALSE;
1998:     /* Now loop over each block-row, and do the factorization */
1999:     for (i=0,row=0; i<node_max; i++) {
2000:       nodesz = ns[i];
2001:       nz     = bi[row+1] - bi[row];
2002:       bjtmp  = bj + bi[row];

2004:       switch (nodesz) {
2005:       case 1:
2006:         for  (j=0; j<nz; j++) {
2007:           idx         = bjtmp[j];
2008:           rtmp11[idx] = 0.0;
2009:         }

2011:         /* load in initial (unfactored row) */
2012:         idx    = r[row];
2013:         nz_tmp = ai[idx+1] - ai[idx];
2014:         ajtmp  = aj + ai[idx];
2015:         v1     = aa + ai[idx];

2017:         for (j=0; j<nz_tmp; j++) {
2018:           idx         = ics[ajtmp[j]];
2019:           rtmp11[idx] = v1[j];
2020:         }
2021:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2023:         prow = *bjtmp++;
2024:         while (prow < row) {
2025:           pc1 = rtmp11 + prow;
2026:           if (*pc1 != 0.0) {
2027:             pv     = ba + bd[prow];
2028:             pj     = nbj + bd[prow];
2029:             mul1   = *pc1 * *pv++;
2030:             *pc1   = mul1;
2031:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2032:             PetscLogFlops(1+2*nz_tmp);
2033:             for (j=0; j<nz_tmp; j++) {
2034:               tmp          = pv[j];
2035:               idx          = pj[j];
2036:               rtmp11[idx] -= mul1 * tmp;
2037:             }
2038:           }
2039:           prow = *bjtmp++;
2040:         }
2041:         pj  = bj + bi[row];
2042:         pc1 = ba + bi[row];

2044:         sctx.pv     = rtmp11[row];
2045:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2046:         rs          = 0.0;
2047:         for (j=0; j<nz; j++) {
2048:           idx    = pj[j];
2049:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2050:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2051:         }
2052:         sctx.rs = rs;
2053:         MatPivotCheck(A,info,&sctx,row);
2054:         if (sctx.newshift) goto endofwhile;
2055:         break;

2057:       case 2:
2058:         for (j=0; j<nz; j++) {
2059:           idx         = bjtmp[j];
2060:           rtmp11[idx] = 0.0;
2061:           rtmp22[idx] = 0.0;
2062:         }

2064:         /* load in initial (unfactored row) */
2065:         idx    = r[row];
2066:         nz_tmp = ai[idx+1] - ai[idx];
2067:         ajtmp  = aj + ai[idx];
2068:         v1     = aa + ai[idx];
2069:         v2     = aa + ai[idx+1];
2070:         for (j=0; j<nz_tmp; j++) {
2071:           idx         = ics[ajtmp[j]];
2072:           rtmp11[idx] = v1[j];
2073:           rtmp22[idx] = v2[j];
2074:         }
2075:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2076:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2078:         prow = *bjtmp++;
2079:         while (prow < row) {
2080:           pc1 = rtmp11 + prow;
2081:           pc2 = rtmp22 + prow;
2082:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2083:             pv   = ba + bd[prow];
2084:             pj   = nbj + bd[prow];
2085:             mul1 = *pc1 * *pv;
2086:             mul2 = *pc2 * *pv;
2087:             ++pv;
2088:             *pc1 = mul1;
2089:             *pc2 = mul2;

2091:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2092:             for (j=0; j<nz_tmp; j++) {
2093:               tmp          = pv[j];
2094:               idx          = pj[j];
2095:               rtmp11[idx] -= mul1 * tmp;
2096:               rtmp22[idx] -= mul2 * tmp;
2097:             }
2098:             PetscLogFlops(2+4*nz_tmp);
2099:           }
2100:           prow = *bjtmp++;
2101:         }

2103:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2104:         pc1 = rtmp11 + prow;
2105:         pc2 = rtmp22 + prow;

2107:         sctx.pv = *pc1;
2108:         pj      = bj + bi[prow];
2109:         rs      = 0.0;
2110:         for (j=0; j<nz; j++) {
2111:           idx = pj[j];
2112:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2113:         }
2114:         sctx.rs = rs;
2115:         MatPivotCheck(A,info,&sctx,row);
2116:         if (sctx.newshift) goto endofwhile;

2118:         if (*pc2 != 0.0) {
2119:           pj     = nbj + bd[prow];
2120:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2121:           *pc2   = mul2;
2122:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2123:           for (j=0; j<nz_tmp; j++) {
2124:             idx          = pj[j];
2125:             tmp          = rtmp11[idx];
2126:             rtmp22[idx] -= mul2 * tmp;
2127:           }
2128:           PetscLogFlops(1+2*nz_tmp);
2129:         }

2131:         pj  = bj + bi[row];
2132:         pc1 = ba + bi[row];
2133:         pc2 = ba + bi[row+1];

2135:         sctx.pv       = rtmp22[row+1];
2136:         rs            = 0.0;
2137:         rtmp11[row]   = 1.0/rtmp11[row];
2138:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2139:         /* copy row entries from dense representation to sparse */
2140:         for (j=0; j<nz; j++) {
2141:           idx    = pj[j];
2142:           pc1[j] = rtmp11[idx];
2143:           pc2[j] = rtmp22[idx];
2144:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2145:         }
2146:         sctx.rs = rs;
2147:         MatPivotCheck(A,info,&sctx,row+1);
2148:         if (sctx.newshift) goto endofwhile;
2149:         break;

2151:       case 3:
2152:         for  (j=0; j<nz; j++) {
2153:           idx         = bjtmp[j];
2154:           rtmp11[idx] = 0.0;
2155:           rtmp22[idx] = 0.0;
2156:           rtmp33[idx] = 0.0;
2157:         }
2158:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2159:         idx    = r[row];
2160:         nz_tmp = ai[idx+1] - ai[idx];
2161:         ajtmp  = aj + ai[idx];
2162:         v1     = aa + ai[idx];
2163:         v2     = aa + ai[idx+1];
2164:         v3     = aa + ai[idx+2];
2165:         for (j=0; j<nz_tmp; j++) {
2166:           idx         = ics[ajtmp[j]];
2167:           rtmp11[idx] = v1[j];
2168:           rtmp22[idx] = v2[j];
2169:           rtmp33[idx] = v3[j];
2170:         }
2171:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2172:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2173:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2175:         /* loop over all pivot row blocks above this row block */
2176:         prow = *bjtmp++;
2177:         while (prow < row) {
2178:           pc1 = rtmp11 + prow;
2179:           pc2 = rtmp22 + prow;
2180:           pc3 = rtmp33 + prow;
2181:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2182:             pv   = ba  + bd[prow];
2183:             pj   = nbj + bd[prow];
2184:             mul1 = *pc1 * *pv;
2185:             mul2 = *pc2 * *pv;
2186:             mul3 = *pc3 * *pv;
2187:             ++pv;
2188:             *pc1 = mul1;
2189:             *pc2 = mul2;
2190:             *pc3 = mul3;

2192:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2193:             /* update this row based on pivot row */
2194:             for (j=0; j<nz_tmp; j++) {
2195:               tmp          = pv[j];
2196:               idx          = pj[j];
2197:               rtmp11[idx] -= mul1 * tmp;
2198:               rtmp22[idx] -= mul2 * tmp;
2199:               rtmp33[idx] -= mul3 * tmp;
2200:             }
2201:             PetscLogFlops(3+6*nz_tmp);
2202:           }
2203:           prow = *bjtmp++;
2204:         }

2206:         /* Now take care of diagonal 3x3 block in this set of rows */
2207:         /* note: prow = row here */
2208:         pc1 = rtmp11 + prow;
2209:         pc2 = rtmp22 + prow;
2210:         pc3 = rtmp33 + prow;

2212:         sctx.pv = *pc1;
2213:         pj      = bj + bi[prow];
2214:         rs      = 0.0;
2215:         for (j=0; j<nz; j++) {
2216:           idx = pj[j];
2217:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2218:         }
2219:         sctx.rs = rs;
2220:         MatPivotCheck(A,info,&sctx,row);
2221:         if (sctx.newshift) goto endofwhile;

2223:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2224:           mul2   = (*pc2)/(*pc1);
2225:           mul3   = (*pc3)/(*pc1);
2226:           *pc2   = mul2;
2227:           *pc3   = mul3;
2228:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2229:           pj     = nbj + bd[prow];
2230:           for (j=0; j<nz_tmp; j++) {
2231:             idx          = pj[j];
2232:             tmp          = rtmp11[idx];
2233:             rtmp22[idx] -= mul2 * tmp;
2234:             rtmp33[idx] -= mul3 * tmp;
2235:           }
2236:           PetscLogFlops(2+4*nz_tmp);
2237:         }
2238:         ++prow;

2240:         pc2     = rtmp22 + prow;
2241:         pc3     = rtmp33 + prow;
2242:         sctx.pv = *pc2;
2243:         pj      = bj + bi[prow];
2244:         rs      = 0.0;
2245:         for (j=0; j<nz; j++) {
2246:           idx = pj[j];
2247:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2248:         }
2249:         sctx.rs = rs;
2250:         MatPivotCheck(A,info,&sctx,row+1);
2251:         if (sctx.newshift) goto endofwhile;

2253:         if (*pc3 != 0.0) {
2254:           mul3   = (*pc3)/(*pc2);
2255:           *pc3   = mul3;
2256:           pj     = nbj + bd[prow];
2257:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2258:           for (j=0; j<nz_tmp; j++) {
2259:             idx          = pj[j];
2260:             tmp          = rtmp22[idx];
2261:             rtmp33[idx] -= mul3 * tmp;
2262:           }
2263:           PetscLogFlops(1+2*nz_tmp);
2264:         }

2266:         pj  = bj + bi[row];
2267:         pc1 = ba + bi[row];
2268:         pc2 = ba + bi[row+1];
2269:         pc3 = ba + bi[row+2];

2271:         sctx.pv       = rtmp33[row+2];
2272:         rs            = 0.0;
2273:         rtmp11[row]   = 1.0/rtmp11[row];
2274:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2275:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2276:         /* copy row entries from dense representation to sparse */
2277:         for (j=0; j<nz; j++) {
2278:           idx    = pj[j];
2279:           pc1[j] = rtmp11[idx];
2280:           pc2[j] = rtmp22[idx];
2281:           pc3[j] = rtmp33[idx];
2282:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2283:         }

2285:         sctx.rs = rs;
2286:         MatPivotCheck(A,info,&sctx,row+2);
2287:         if (sctx.newshift) goto endofwhile;
2288:         break;

2290:       default:
2291:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2292:       }
2293:       row += nodesz;                 /* Update the row */
2294:     }
2295: endofwhile:;
2296:   } while (sctx.newshift);
2297:   PetscFree(rtmp11);
2298:   PetscFree(tmp_vec2);
2299:   ISRestoreIndices(isicol,&ic);
2300:   ISRestoreIndices(isrow,&r);
2301:   ISRestoreIndices(iscol,&c);

2303:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2304:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2305:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2306:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2307:   C->assembled              = PETSC_TRUE;
2308:   C->preallocated           = PETSC_TRUE;
2309:   if (sctx.nshift) {
2310:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2311:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2312:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2313:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2314:     }
2315:   }
2316:   PetscLogFlops(C->cmap->n);
2317:   Mat_CheckInode(C,PETSC_FALSE);
2318:   return(0);
2319: }


2322: /* ----------------------------------------------------------- */
2325: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2326: {
2327:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2328:   IS                iscol = a->col,isrow = a->row;
2329:   PetscErrorCode    ierr;
2330:   const PetscInt    *r,*c,*rout,*cout;
2331:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2332:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2333:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2334:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2335:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2336:   const PetscScalar *b;

2339:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2340:   node_max = a->inode.node_count;
2341:   ns       = a->inode.size;     /* Node Size array */

2343:   VecGetArrayRead(bb,&b);
2344:   VecGetArray(xx,&x);
2345:   tmp  = a->solve_work;

2347:   ISGetIndices(isrow,&rout); r = rout;
2348:   ISGetIndices(iscol,&cout); c = cout;

2350:   /* forward solve the lower triangular */
2351:   tmps = tmp;
2352:   aa   = a_a;
2353:   aj   = a_j;
2354:   ad   = a->diag;

2356:   for (i = 0,row = 0; i< node_max; ++i) {
2357:     nsz = ns[i];
2358:     aii = ai[row];
2359:     v1  = aa + aii;
2360:     vi  = aj + aii;
2361:     nz  = ai[row+1]- ai[row];

2363:     if (i < node_max-1) {
2364:       /* Prefetch the indices for the next block */
2365:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2366:       /* Prefetch the data for the next block */
2367:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2368:     }

2370:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2371:     case 1:
2372:       sum1 = b[r[row]];
2373:       for (j=0; j<nz-1; j+=2) {
2374:         i0    = vi[j];
2375:         i1    = vi[j+1];
2376:         tmp0  = tmps[i0];
2377:         tmp1  = tmps[i1];
2378:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2379:       }
2380:       if (j == nz-1) {
2381:         tmp0  = tmps[vi[j]];
2382:         sum1 -= v1[j]*tmp0;
2383:       }
2384:       tmp[row++]=sum1;
2385:       break;
2386:     case 2:
2387:       sum1 = b[r[row]];
2388:       sum2 = b[r[row+1]];
2389:       v2   = aa + ai[row+1];

2391:       for (j=0; j<nz-1; j+=2) {
2392:         i0    = vi[j];
2393:         i1    = vi[j+1];
2394:         tmp0  = tmps[i0];
2395:         tmp1  = tmps[i1];
2396:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2397:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2398:       }
2399:       if (j == nz-1) {
2400:         tmp0  = tmps[vi[j]];
2401:         sum1 -= v1[j] *tmp0;
2402:         sum2 -= v2[j] *tmp0;
2403:       }
2404:       sum2     -= v2[nz] * sum1;
2405:       tmp[row++]=sum1;
2406:       tmp[row++]=sum2;
2407:       break;
2408:     case 3:
2409:       sum1 = b[r[row]];
2410:       sum2 = b[r[row+1]];
2411:       sum3 = b[r[row+2]];
2412:       v2   = aa + ai[row+1];
2413:       v3   = aa + ai[row+2];

2415:       for (j=0; j<nz-1; j+=2) {
2416:         i0    = vi[j];
2417:         i1    = vi[j+1];
2418:         tmp0  = tmps[i0];
2419:         tmp1  = tmps[i1];
2420:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2421:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2422:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2423:       }
2424:       if (j == nz-1) {
2425:         tmp0  = tmps[vi[j]];
2426:         sum1 -= v1[j] *tmp0;
2427:         sum2 -= v2[j] *tmp0;
2428:         sum3 -= v3[j] *tmp0;
2429:       }
2430:       sum2     -= v2[nz] * sum1;
2431:       sum3     -= v3[nz] * sum1;
2432:       sum3     -= v3[nz+1] * sum2;
2433:       tmp[row++]=sum1;
2434:       tmp[row++]=sum2;
2435:       tmp[row++]=sum3;
2436:       break;

2438:     case 4:
2439:       sum1 = b[r[row]];
2440:       sum2 = b[r[row+1]];
2441:       sum3 = b[r[row+2]];
2442:       sum4 = b[r[row+3]];
2443:       v2   = aa + ai[row+1];
2444:       v3   = aa + ai[row+2];
2445:       v4   = aa + ai[row+3];

2447:       for (j=0; j<nz-1; j+=2) {
2448:         i0    = vi[j];
2449:         i1    = vi[j+1];
2450:         tmp0  = tmps[i0];
2451:         tmp1  = tmps[i1];
2452:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2453:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2454:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2455:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2456:       }
2457:       if (j == nz-1) {
2458:         tmp0  = tmps[vi[j]];
2459:         sum1 -= v1[j] *tmp0;
2460:         sum2 -= v2[j] *tmp0;
2461:         sum3 -= v3[j] *tmp0;
2462:         sum4 -= v4[j] *tmp0;
2463:       }
2464:       sum2 -= v2[nz] * sum1;
2465:       sum3 -= v3[nz] * sum1;
2466:       sum4 -= v4[nz] * sum1;
2467:       sum3 -= v3[nz+1] * sum2;
2468:       sum4 -= v4[nz+1] * sum2;
2469:       sum4 -= v4[nz+2] * sum3;

2471:       tmp[row++]=sum1;
2472:       tmp[row++]=sum2;
2473:       tmp[row++]=sum3;
2474:       tmp[row++]=sum4;
2475:       break;
2476:     case 5:
2477:       sum1 = b[r[row]];
2478:       sum2 = b[r[row+1]];
2479:       sum3 = b[r[row+2]];
2480:       sum4 = b[r[row+3]];
2481:       sum5 = b[r[row+4]];
2482:       v2   = aa + ai[row+1];
2483:       v3   = aa + ai[row+2];
2484:       v4   = aa + ai[row+3];
2485:       v5   = aa + ai[row+4];

2487:       for (j=0; j<nz-1; j+=2) {
2488:         i0    = vi[j];
2489:         i1    = vi[j+1];
2490:         tmp0  = tmps[i0];
2491:         tmp1  = tmps[i1];
2492:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2493:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2494:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2495:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2496:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2497:       }
2498:       if (j == nz-1) {
2499:         tmp0  = tmps[vi[j]];
2500:         sum1 -= v1[j] *tmp0;
2501:         sum2 -= v2[j] *tmp0;
2502:         sum3 -= v3[j] *tmp0;
2503:         sum4 -= v4[j] *tmp0;
2504:         sum5 -= v5[j] *tmp0;
2505:       }

2507:       sum2 -= v2[nz] * sum1;
2508:       sum3 -= v3[nz] * sum1;
2509:       sum4 -= v4[nz] * sum1;
2510:       sum5 -= v5[nz] * sum1;
2511:       sum3 -= v3[nz+1] * sum2;
2512:       sum4 -= v4[nz+1] * sum2;
2513:       sum5 -= v5[nz+1] * sum2;
2514:       sum4 -= v4[nz+2] * sum3;
2515:       sum5 -= v5[nz+2] * sum3;
2516:       sum5 -= v5[nz+3] * sum4;

2518:       tmp[row++]=sum1;
2519:       tmp[row++]=sum2;
2520:       tmp[row++]=sum3;
2521:       tmp[row++]=sum4;
2522:       tmp[row++]=sum5;
2523:       break;
2524:     default:
2525:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2526:     }
2527:   }
2528:   /* backward solve the upper triangular */
2529:   for (i=node_max -1,row = n-1; i>=0; i--) {
2530:     nsz = ns[i];
2531:     aii = ad[row+1] + 1;
2532:     v1  = aa + aii;
2533:     vi  = aj + aii;
2534:     nz  = ad[row]- ad[row+1] - 1;

2536:     if (i > 0) {
2537:       /* Prefetch the indices for the next block */
2538:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2539:       /* Prefetch the data for the next block */
2540:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541:     }

2543:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2544:     case 1:
2545:       sum1 = tmp[row];

2547:       for (j=0; j<nz-1; j+=2) {
2548:         i0    = vi[j];
2549:         i1    = vi[j+1];
2550:         tmp0  = tmps[i0];
2551:         tmp1  = tmps[i1];
2552:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2553:       }
2554:       if (j == nz-1) {
2555:         tmp0  = tmps[vi[j]];
2556:         sum1 -= v1[j]*tmp0;
2557:       }
2558:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2559:       break;
2560:     case 2:
2561:       sum1 = tmp[row];
2562:       sum2 = tmp[row-1];
2563:       v2   = aa + ad[row] + 1;
2564:       for (j=0; j<nz-1; j+=2) {
2565:         i0    = vi[j];
2566:         i1    = vi[j+1];
2567:         tmp0  = tmps[i0];
2568:         tmp1  = tmps[i1];
2569:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2570:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2571:       }
2572:       if (j == nz-1) {
2573:         tmp0  = tmps[vi[j]];
2574:         sum1 -= v1[j]* tmp0;
2575:         sum2 -= v2[j+1]* tmp0;
2576:       }

2578:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2579:       sum2     -= v2[0] * tmp0;
2580:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2581:       break;
2582:     case 3:
2583:       sum1 = tmp[row];
2584:       sum2 = tmp[row -1];
2585:       sum3 = tmp[row -2];
2586:       v2   = aa + ad[row] + 1;
2587:       v3   = aa + ad[row -1] + 1;
2588:       for (j=0; j<nz-1; j+=2) {
2589:         i0    = vi[j];
2590:         i1    = vi[j+1];
2591:         tmp0  = tmps[i0];
2592:         tmp1  = tmps[i1];
2593:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2594:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2595:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2596:       }
2597:       if (j== nz-1) {
2598:         tmp0  = tmps[vi[j]];
2599:         sum1 -= v1[j] * tmp0;
2600:         sum2 -= v2[j+1] * tmp0;
2601:         sum3 -= v3[j+2] * tmp0;
2602:       }
2603:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604:       sum2     -= v2[0]* tmp0;
2605:       sum3     -= v3[1] * tmp0;
2606:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2607:       sum3     -= v3[0]* tmp0;
2608:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2610:       break;
2611:     case 4:
2612:       sum1 = tmp[row];
2613:       sum2 = tmp[row -1];
2614:       sum3 = tmp[row -2];
2615:       sum4 = tmp[row -3];
2616:       v2   = aa + ad[row]+1;
2617:       v3   = aa + ad[row -1]+1;
2618:       v4   = aa + ad[row -2]+1;

2620:       for (j=0; j<nz-1; j+=2) {
2621:         i0    = vi[j];
2622:         i1    = vi[j+1];
2623:         tmp0  = tmps[i0];
2624:         tmp1  = tmps[i1];
2625:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2626:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2627:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2628:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2629:       }
2630:       if (j== nz-1) {
2631:         tmp0  = tmps[vi[j]];
2632:         sum1 -= v1[j] * tmp0;
2633:         sum2 -= v2[j+1] * tmp0;
2634:         sum3 -= v3[j+2] * tmp0;
2635:         sum4 -= v4[j+3] * tmp0;
2636:       }

2638:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2639:       sum2     -= v2[0] * tmp0;
2640:       sum3     -= v3[1] * tmp0;
2641:       sum4     -= v4[2] * tmp0;
2642:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2643:       sum3     -= v3[0] * tmp0;
2644:       sum4     -= v4[1] * tmp0;
2645:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2646:       sum4     -= v4[0] * tmp0;
2647:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2648:       break;
2649:     case 5:
2650:       sum1 = tmp[row];
2651:       sum2 = tmp[row -1];
2652:       sum3 = tmp[row -2];
2653:       sum4 = tmp[row -3];
2654:       sum5 = tmp[row -4];
2655:       v2   = aa + ad[row]+1;
2656:       v3   = aa + ad[row -1]+1;
2657:       v4   = aa + ad[row -2]+1;
2658:       v5   = aa + ad[row -3]+1;
2659:       for (j=0; j<nz-1; j+=2) {
2660:         i0    = vi[j];
2661:         i1    = vi[j+1];
2662:         tmp0  = tmps[i0];
2663:         tmp1  = tmps[i1];
2664:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2665:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2666:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2667:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2668:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2669:       }
2670:       if (j==nz-1) {
2671:         tmp0  = tmps[vi[j]];
2672:         sum1 -= v1[j] * tmp0;
2673:         sum2 -= v2[j+1] * tmp0;
2674:         sum3 -= v3[j+2] * tmp0;
2675:         sum4 -= v4[j+3] * tmp0;
2676:         sum5 -= v5[j+4] * tmp0;
2677:       }

2679:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2680:       sum2     -= v2[0] * tmp0;
2681:       sum3     -= v3[1] * tmp0;
2682:       sum4     -= v4[2] * tmp0;
2683:       sum5     -= v5[3] * tmp0;
2684:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2685:       sum3     -= v3[0] * tmp0;
2686:       sum4     -= v4[1] * tmp0;
2687:       sum5     -= v5[2] * tmp0;
2688:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2689:       sum4     -= v4[0] * tmp0;
2690:       sum5     -= v5[1] * tmp0;
2691:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2692:       sum5     -= v5[0] * tmp0;
2693:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2694:       break;
2695:     default:
2696:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2697:     }
2698:   }
2699:   ISRestoreIndices(isrow,&rout);
2700:   ISRestoreIndices(iscol,&cout);
2701:   VecRestoreArrayRead(bb,&b);
2702:   VecRestoreArray(xx,&x);
2703:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2704:   return(0);
2705: }


2708: /*
2709:      Makes a longer coloring[] array and calls the usual code with that
2710: */
2713: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2714: {
2715:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2716:   PetscErrorCode  ierr;
2717:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2718:   PetscInt        *colorused,i;
2719:   ISColoringValue *newcolor;

2722:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2723:   /* loop over inodes, marking a color for each column*/
2724:   row = 0;
2725:   for (i=0; i<m; i++) {
2726:     for (j=0; j<ns[i]; j++) {
2727:       newcolor[row++] = coloring[i] + j*ncolors;
2728:     }
2729:   }

2731:   /* eliminate unneeded colors */
2732:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2733:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2734:   for (i=0; i<n; i++) {
2735:     colorused[newcolor[i]] = 1;
2736:   }

2738:   for (i=1; i<5*ncolors; i++) {
2739:     colorused[i] += colorused[i-1];
2740:   }
2741:   ncolors = colorused[5*ncolors-1];
2742:   for (i=0; i<n; i++) {
2743:     newcolor[i] = colorused[newcolor[i]]-1;
2744:   }
2745:   PetscFree(colorused);
2746:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);
2747:   PetscFree(coloring);
2748:   return(0);
2749: }

2751: #include <petsc-private/kernels/blockinvert.h>

2755: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2756: {
2757:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2758:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2759:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2760:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2761:   const MatScalar   *v = a->a,*v1,*v2,*v3,*v4,*v5;
2762:   const PetscScalar *xb, *b;
2763:   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2764:   PetscErrorCode    ierr;
2765:   PetscInt          n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766:   PetscInt          *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];

2769:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2772:   if (!a->inode.ibdiagvalid) {
2773:     if (!a->inode.ibdiag) {
2774:       /* calculate space needed for diagonal blocks */
2775:       for (i=0; i<m; i++) {
2776:         cnt += sizes[i]*sizes[i];
2777:       }
2778:       a->inode.bdiagsize = cnt;

2780:       PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2781:     }

2783:     /* copy over the diagonal blocks and invert them */
2784:     ibdiag = a->inode.ibdiag;
2785:     bdiag  = a->inode.bdiag;
2786:     cnt    = 0;
2787:     for (i=0, row = 0; i<m; i++) {
2788:       for (j=0; j<sizes[i]; j++) {
2789:         for (k=0; k<sizes[i]; k++) {
2790:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791:         }
2792:       }
2793:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));

2795:       switch (sizes[i]) {
2796:       case 1:
2797:         /* Create matrix data structure */
2798:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2800:         break;
2801:       case 2:
2802:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2803:         break;
2804:       case 3:
2805:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2806:         break;
2807:       case 4:
2808:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2809:         break;
2810:       case 5:
2811:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2812:         break;
2813:       default:
2814:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815:       }
2816:       cnt += sizes[i]*sizes[i];
2817:       row += sizes[i];
2818:     }
2819:     a->inode.ibdiagvalid = PETSC_TRUE;
2820:   }
2821:   ibdiag = a->inode.ibdiag;
2822:   bdiag  = a->inode.bdiag;
2823:   t      = a->inode.ssor_work;

2825:   VecGetArray(xx,&x);
2826:   VecGetArrayRead(bb,&b);
2827:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2828:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2829:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2831:       for (i=0, row=0; i<m; i++) {
2832:         sz  = diag[row] - ii[row];
2833:         v1  = a->a + ii[row];
2834:         idx = a->j + ii[row];

2836:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2837:         switch (sizes[i]) {
2838:         case 1:

2840:           sum1 = b[row];
2841:           for (n = 0; n<sz-1; n+=2) {
2842:             i1    = idx[0];
2843:             i2    = idx[1];
2844:             idx  += 2;
2845:             tmp0  = x[i1];
2846:             tmp1  = x[i2];
2847:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2848:           }

2850:           if (n == sz-1) {
2851:             tmp0  = x[*idx];
2852:             sum1 -= *v1 * tmp0;
2853:           }
2854:           t[row]   = sum1;
2855:           x[row++] = sum1*(*ibdiag++);
2856:           break;
2857:         case 2:
2858:           v2   = a->a + ii[row+1];
2859:           sum1 = b[row];
2860:           sum2 = b[row+1];
2861:           for (n = 0; n<sz-1; n+=2) {
2862:             i1    = idx[0];
2863:             i2    = idx[1];
2864:             idx  += 2;
2865:             tmp0  = x[i1];
2866:             tmp1  = x[i2];
2867:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869:           }

2871:           if (n == sz-1) {
2872:             tmp0  = x[*idx];
2873:             sum1 -= v1[0] * tmp0;
2874:             sum2 -= v2[0] * tmp0;
2875:           }
2876:           t[row]   = sum1;
2877:           t[row+1] = sum2;
2878:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2879:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2880:           ibdiag  += 4;
2881:           break;
2882:         case 3:
2883:           v2   = a->a + ii[row+1];
2884:           v3   = a->a + ii[row+2];
2885:           sum1 = b[row];
2886:           sum2 = b[row+1];
2887:           sum3 = b[row+2];
2888:           for (n = 0; n<sz-1; n+=2) {
2889:             i1    = idx[0];
2890:             i2    = idx[1];
2891:             idx  += 2;
2892:             tmp0  = x[i1];
2893:             tmp1  = x[i2];
2894:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2895:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2896:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2897:           }

2899:           if (n == sz-1) {
2900:             tmp0  = x[*idx];
2901:             sum1 -= v1[0] * tmp0;
2902:             sum2 -= v2[0] * tmp0;
2903:             sum3 -= v3[0] * tmp0;
2904:           }
2905:           t[row]   = sum1;
2906:           t[row+1] = sum2;
2907:           t[row+2] = sum3;
2908:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2909:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2910:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2911:           ibdiag  += 9;
2912:           break;
2913:         case 4:
2914:           v2   = a->a + ii[row+1];
2915:           v3   = a->a + ii[row+2];
2916:           v4   = a->a + ii[row+3];
2917:           sum1 = b[row];
2918:           sum2 = b[row+1];
2919:           sum3 = b[row+2];
2920:           sum4 = b[row+3];
2921:           for (n = 0; n<sz-1; n+=2) {
2922:             i1    = idx[0];
2923:             i2    = idx[1];
2924:             idx  += 2;
2925:             tmp0  = x[i1];
2926:             tmp1  = x[i2];
2927:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2928:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2929:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2930:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2931:           }

2933:           if (n == sz-1) {
2934:             tmp0  = x[*idx];
2935:             sum1 -= v1[0] * tmp0;
2936:             sum2 -= v2[0] * tmp0;
2937:             sum3 -= v3[0] * tmp0;
2938:             sum4 -= v4[0] * tmp0;
2939:           }
2940:           t[row]   = sum1;
2941:           t[row+1] = sum2;
2942:           t[row+2] = sum3;
2943:           t[row+3] = sum4;
2944:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2945:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2946:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2947:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2948:           ibdiag  += 16;
2949:           break;
2950:         case 5:
2951:           v2   = a->a + ii[row+1];
2952:           v3   = a->a + ii[row+2];
2953:           v4   = a->a + ii[row+3];
2954:           v5   = a->a + ii[row+4];
2955:           sum1 = b[row];
2956:           sum2 = b[row+1];
2957:           sum3 = b[row+2];
2958:           sum4 = b[row+3];
2959:           sum5 = b[row+4];
2960:           for (n = 0; n<sz-1; n+=2) {
2961:             i1    = idx[0];
2962:             i2    = idx[1];
2963:             idx  += 2;
2964:             tmp0  = x[i1];
2965:             tmp1  = x[i2];
2966:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2967:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2968:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2969:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2970:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2971:           }

2973:           if (n == sz-1) {
2974:             tmp0  = x[*idx];
2975:             sum1 -= v1[0] * tmp0;
2976:             sum2 -= v2[0] * tmp0;
2977:             sum3 -= v3[0] * tmp0;
2978:             sum4 -= v4[0] * tmp0;
2979:             sum5 -= v5[0] * tmp0;
2980:           }
2981:           t[row]   = sum1;
2982:           t[row+1] = sum2;
2983:           t[row+2] = sum3;
2984:           t[row+3] = sum4;
2985:           t[row+4] = sum5;
2986:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2987:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2988:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2989:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2990:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2991:           ibdiag  += 25;
2992:           break;
2993:         default:
2994:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2995:         }
2996:       }

2998:       xb   = t;
2999:       PetscLogFlops(a->nz);
3000:     } else xb = b;
3001:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3003:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3004:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3005:         ibdiag -= sizes[i]*sizes[i];
3006:         sz      = ii[row+1] - diag[row] - 1;
3007:         v1      = a->a + diag[row] + 1;
3008:         idx     = a->j + diag[row] + 1;

3010:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3011:         switch (sizes[i]) {
3012:         case 1:

3014:           sum1 = xb[row];
3015:           for (n = 0; n<sz-1; n+=2) {
3016:             i1    = idx[0];
3017:             i2    = idx[1];
3018:             idx  += 2;
3019:             tmp0  = x[i1];
3020:             tmp1  = x[i2];
3021:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3022:           }

3024:           if (n == sz-1) {
3025:             tmp0  = x[*idx];
3026:             sum1 -= *v1*tmp0;
3027:           }
3028:           x[row--] = sum1*(*ibdiag);
3029:           break;

3031:         case 2:

3033:           sum1 = xb[row];
3034:           sum2 = xb[row-1];
3035:           /* note that sum1 is associated with the second of the two rows */
3036:           v2 = a->a + diag[row-1] + 2;
3037:           for (n = 0; n<sz-1; n+=2) {
3038:             i1    = idx[0];
3039:             i2    = idx[1];
3040:             idx  += 2;
3041:             tmp0  = x[i1];
3042:             tmp1  = x[i2];
3043:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3044:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3045:           }

3047:           if (n == sz-1) {
3048:             tmp0  = x[*idx];
3049:             sum1 -= *v1*tmp0;
3050:             sum2 -= *v2*tmp0;
3051:           }
3052:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3053:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3054:           break;
3055:         case 3:

3057:           sum1 = xb[row];
3058:           sum2 = xb[row-1];
3059:           sum3 = xb[row-2];
3060:           v2   = a->a + diag[row-1] + 2;
3061:           v3   = a->a + diag[row-2] + 3;
3062:           for (n = 0; n<sz-1; n+=2) {
3063:             i1    = idx[0];
3064:             i2    = idx[1];
3065:             idx  += 2;
3066:             tmp0  = x[i1];
3067:             tmp1  = x[i2];
3068:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3069:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3070:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3071:           }

3073:           if (n == sz-1) {
3074:             tmp0  = x[*idx];
3075:             sum1 -= *v1*tmp0;
3076:             sum2 -= *v2*tmp0;
3077:             sum3 -= *v3*tmp0;
3078:           }
3079:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3080:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3081:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3082:           break;
3083:         case 4:

3085:           sum1 = xb[row];
3086:           sum2 = xb[row-1];
3087:           sum3 = xb[row-2];
3088:           sum4 = xb[row-3];
3089:           v2   = a->a + diag[row-1] + 2;
3090:           v3   = a->a + diag[row-2] + 3;
3091:           v4   = a->a + diag[row-3] + 4;
3092:           for (n = 0; n<sz-1; n+=2) {
3093:             i1    = idx[0];
3094:             i2    = idx[1];
3095:             idx  += 2;
3096:             tmp0  = x[i1];
3097:             tmp1  = x[i2];
3098:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3101:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3102:           }

3104:           if (n == sz-1) {
3105:             tmp0  = x[*idx];
3106:             sum1 -= *v1*tmp0;
3107:             sum2 -= *v2*tmp0;
3108:             sum3 -= *v3*tmp0;
3109:             sum4 -= *v4*tmp0;
3110:           }
3111:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3112:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3113:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3114:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3115:           break;
3116:         case 5:

3118:           sum1 = xb[row];
3119:           sum2 = xb[row-1];
3120:           sum3 = xb[row-2];
3121:           sum4 = xb[row-3];
3122:           sum5 = xb[row-4];
3123:           v2   = a->a + diag[row-1] + 2;
3124:           v3   = a->a + diag[row-2] + 3;
3125:           v4   = a->a + diag[row-3] + 4;
3126:           v5   = a->a + diag[row-4] + 5;
3127:           for (n = 0; n<sz-1; n+=2) {
3128:             i1    = idx[0];
3129:             i2    = idx[1];
3130:             idx  += 2;
3131:             tmp0  = x[i1];
3132:             tmp1  = x[i2];
3133:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3134:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3135:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3136:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3137:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3138:           }

3140:           if (n == sz-1) {
3141:             tmp0  = x[*idx];
3142:             sum1 -= *v1*tmp0;
3143:             sum2 -= *v2*tmp0;
3144:             sum3 -= *v3*tmp0;
3145:             sum4 -= *v4*tmp0;
3146:             sum5 -= *v5*tmp0;
3147:           }
3148:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3149:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3150:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3151:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3152:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3153:           break;
3154:         default:
3155:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3156:         }
3157:       }

3159:       PetscLogFlops(a->nz);
3160:     }
3161:     its--;
3162:   }
3163:   while (its--) {

3165:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3166:       ibdiag = a->inode.ibdiag;

3168:       for (i=0, row=0; i<m; i++) {
3169:         sz  = ii[row + 1] - ii[row];
3170:         v1  = a->a + ii[row];
3171:         idx = a->j + ii[row];

3173:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3174:         switch (sizes[i]) {
3175:         case 1:

3177:           sum1 = b[row];
3178:           for (n = 0; n<sz-1; n+=2) {
3179:             i1    = idx[0];
3180:             i2    = idx[1];
3181:             idx  += 2;
3182:             tmp0  = x[i1];
3183:             tmp1  = x[i2];
3184:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3185:           }

3187:           if (n == sz-1) {
3188:             tmp0  = x[*idx];
3189:             sum1 -= *v1 * tmp0;
3190:           }

3192:           /* in MatSOR_SeqAIJ this line would be
3193:            *
3194:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3195:            *
3196:            * but omega == 1, so this becomes
3197:            *
3198:            * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3199:            *
3200:            * but bdiag and ibdiag cancel each other, so we can change this
3201:            * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3202:            * block sizes as well
3203:            */
3204:           x[row++] += sum1*(*ibdiag++);
3205:           break;
3206:         case 2:
3207:           v2   = a->a + ii[row+1];
3208:           sum1 = b[row];
3209:           sum2 = b[row+1];
3210:           for (n = 0; n<sz-1; n+=2) {
3211:             i1    = idx[0];
3212:             i2    = idx[1];
3213:             idx  += 2;
3214:             tmp0  = x[i1];
3215:             tmp1  = x[i2];
3216:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3217:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3218:           }

3220:           if (n == sz-1) {
3221:             tmp0  = x[*idx];
3222:             sum1 -= v1[0] * tmp0;
3223:             sum2 -= v2[0] * tmp0;
3224:           }
3225:           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3226:           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3227:           ibdiag   += 4;
3228:           break;
3229:         case 3:
3230:           v2   = a->a + ii[row+1];
3231:           v3   = a->a + ii[row+2];
3232:           sum1 = b[row];
3233:           sum2 = b[row+1];
3234:           sum3 = b[row+2];
3235:           for (n = 0; n<sz-1; n+=2) {
3236:             i1    = idx[0];
3237:             i2    = idx[1];
3238:             idx  += 2;
3239:             tmp0  = x[i1];
3240:             tmp1  = x[i2];
3241:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3242:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3243:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3244:           }

3246:           if (n == sz-1) {
3247:             tmp0  = x[*idx];
3248:             sum1 -= v1[0] * tmp0;
3249:             sum2 -= v2[0] * tmp0;
3250:             sum3 -= v3[0] * tmp0;
3251:           }
3252:           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3253:           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3254:           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3255:           ibdiag   += 9;
3256:           break;
3257:         case 4:
3258:           v2   = a->a + ii[row+1];
3259:           v3   = a->a + ii[row+2];
3260:           v4   = a->a + ii[row+3];
3261:           sum1 = b[row];
3262:           sum2 = b[row+1];
3263:           sum3 = b[row+2];
3264:           sum4 = b[row+3];
3265:           for (n = 0; n<sz-1; n+=2) {
3266:             i1    = idx[0];
3267:             i2    = idx[1];
3268:             idx  += 2;
3269:             tmp0  = x[i1];
3270:             tmp1  = x[i2];
3271:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3272:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3273:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3274:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3275:           }

3277:           if (n == sz-1) {
3278:             tmp0  = x[*idx];
3279:             sum1 -= v1[0] * tmp0;
3280:             sum2 -= v2[0] * tmp0;
3281:             sum3 -= v3[0] * tmp0;
3282:             sum4 -= v4[0] * tmp0;
3283:           }
3284:           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3285:           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3286:           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3287:           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3288:           ibdiag   += 16;
3289:           break;
3290:         case 5:
3291:           v2   = a->a + ii[row+1];
3292:           v3   = a->a + ii[row+2];
3293:           v4   = a->a + ii[row+3];
3294:           v5   = a->a + ii[row+4];
3295:           sum1 = b[row];
3296:           sum2 = b[row+1];
3297:           sum3 = b[row+2];
3298:           sum4 = b[row+3];
3299:           sum5 = b[row+4];
3300:           for (n = 0; n<sz-1; n+=2) {
3301:             i1    = idx[0];
3302:             i2    = idx[1];
3303:             idx  += 2;
3304:             tmp0  = x[i1];
3305:             tmp1  = x[i2];
3306:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3311:           }

3313:           if (n == sz-1) {
3314:             tmp0  = x[*idx];
3315:             sum1 -= v1[0] * tmp0;
3316:             sum2 -= v2[0] * tmp0;
3317:             sum3 -= v3[0] * tmp0;
3318:             sum4 -= v4[0] * tmp0;
3319:             sum5 -= v5[0] * tmp0;
3320:           }
3321:           x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3322:           x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3323:           x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3324:           x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3325:           x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3326:           ibdiag   += 25;
3327:           break;
3328:         default:
3329:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3330:         }
3331:       }

3333:       PetscLogFlops(2.0*a->nz);
3334:     }
3335:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3337:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3338:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3339:         ibdiag -= sizes[i]*sizes[i];
3340:         sz      = ii[row+1] - ii[row];
3341:         v1      = a->a + ii[row];
3342:         idx     = a->j + ii[row];

3344:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3345:         switch (sizes[i]) {
3346:         case 1:

3348:           sum1 = b[row];
3349:           for (n = 0; n<sz-1; n+=2) {
3350:             i1    = idx[0];
3351:             i2    = idx[1];
3352:             idx  += 2;
3353:             tmp0  = x[i1];
3354:             tmp1  = x[i2];
3355:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3356:           }

3358:           if (n == sz-1) {
3359:             tmp0  = x[*idx];
3360:             sum1 -= *v1*tmp0;
3361:           }
3362:           x[row--] += sum1*(*ibdiag);
3363:           break;

3365:         case 2:

3367:           sum1 = b[row];
3368:           sum2 = b[row-1];
3369:           /* note that sum1 is associated with the second of the two rows */
3370:           v2 = a->a + ii[row - 1];
3371:           for (n = 0; n<sz-1; n+=2) {
3372:             i1    = idx[0];
3373:             i2    = idx[1];
3374:             idx  += 2;
3375:             tmp0  = x[i1];
3376:             tmp1  = x[i2];
3377:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3378:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3379:           }

3381:           if (n == sz-1) {
3382:             tmp0  = x[*idx];
3383:             sum1 -= *v1*tmp0;
3384:             sum2 -= *v2*tmp0;
3385:           }
3386:           x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3387:           x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3388:           break;
3389:         case 3:

3391:           sum1 = b[row];
3392:           sum2 = b[row-1];
3393:           sum3 = b[row-2];
3394:           v2   = a->a + ii[row-1];
3395:           v3   = a->a + ii[row-2];
3396:           for (n = 0; n<sz-1; n+=2) {
3397:             i1    = idx[0];
3398:             i2    = idx[1];
3399:             idx  += 2;
3400:             tmp0  = x[i1];
3401:             tmp1  = x[i2];
3402:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3403:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3404:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3405:           }

3407:           if (n == sz-1) {
3408:             tmp0  = x[*idx];
3409:             sum1 -= *v1*tmp0;
3410:             sum2 -= *v2*tmp0;
3411:             sum3 -= *v3*tmp0;
3412:           }
3413:           x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3414:           x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3415:           x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3416:           break;
3417:         case 4:

3419:           sum1 = b[row];
3420:           sum2 = b[row-1];
3421:           sum3 = b[row-2];
3422:           sum4 = b[row-3];
3423:           v2   = a->a + ii[row-1];
3424:           v3   = a->a + ii[row-2];
3425:           v4   = a->a + ii[row-3];
3426:           for (n = 0; n<sz-1; n+=2) {
3427:             i1    = idx[0];
3428:             i2    = idx[1];
3429:             idx  += 2;
3430:             tmp0  = x[i1];
3431:             tmp1  = x[i2];
3432:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3433:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3434:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3435:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3436:           }

3438:           if (n == sz-1) {
3439:             tmp0  = x[*idx];
3440:             sum1 -= *v1*tmp0;
3441:             sum2 -= *v2*tmp0;
3442:             sum3 -= *v3*tmp0;
3443:             sum4 -= *v4*tmp0;
3444:           }
3445:           x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3446:           x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3447:           x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3448:           x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3449:           break;
3450:         case 5:

3452:           sum1 = b[row];
3453:           sum2 = b[row-1];
3454:           sum3 = b[row-2];
3455:           sum4 = b[row-3];
3456:           sum5 = b[row-4];
3457:           v2   = a->a + ii[row-1];
3458:           v3   = a->a + ii[row-2];
3459:           v4   = a->a + ii[row-3];
3460:           v5   = a->a + ii[row-4];
3461:           for (n = 0; n<sz-1; n+=2) {
3462:             i1    = idx[0];
3463:             i2    = idx[1];
3464:             idx  += 2;
3465:             tmp0  = x[i1];
3466:             tmp1  = x[i2];
3467:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3468:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3469:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3470:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3471:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3472:           }

3474:           if (n == sz-1) {
3475:             tmp0  = x[*idx];
3476:             sum1 -= *v1*tmp0;
3477:             sum2 -= *v2*tmp0;
3478:             sum3 -= *v3*tmp0;
3479:             sum4 -= *v4*tmp0;
3480:             sum5 -= *v5*tmp0;
3481:           }
3482:           x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3483:           x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3484:           x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3485:           x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3486:           x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3487:           break;
3488:         default:
3489:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3490:         }
3491:       }

3493:       PetscLogFlops(2.0*a->nz);
3494:     }
3495:   }
3496:   if (flag & SOR_EISENSTAT) {
3497:     /*
3498:           Apply  (U + D)^-1  where D is now the block diagonal
3499:     */
3500:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3501:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3502:       ibdiag -= sizes[i]*sizes[i];
3503:       sz      = ii[row+1] - diag[row] - 1;
3504:       v1      = a->a + diag[row] + 1;
3505:       idx     = a->j + diag[row] + 1;
3506:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3507:       switch (sizes[i]) {
3508:       case 1:

3510:         sum1 = b[row];
3511:         for (n = 0; n<sz-1; n+=2) {
3512:           i1    = idx[0];
3513:           i2    = idx[1];
3514:           idx  += 2;
3515:           tmp0  = x[i1];
3516:           tmp1  = x[i2];
3517:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3518:         }

3520:         if (n == sz-1) {
3521:           tmp0  = x[*idx];
3522:           sum1 -= *v1*tmp0;
3523:         }
3524:         x[row] = sum1*(*ibdiag);row--;
3525:         break;

3527:       case 2:

3529:         sum1 = b[row];
3530:         sum2 = b[row-1];
3531:         /* note that sum1 is associated with the second of the two rows */
3532:         v2 = a->a + diag[row-1] + 2;
3533:         for (n = 0; n<sz-1; n+=2) {
3534:           i1    = idx[0];
3535:           i2    = idx[1];
3536:           idx  += 2;
3537:           tmp0  = x[i1];
3538:           tmp1  = x[i2];
3539:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3540:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3541:         }

3543:         if (n == sz-1) {
3544:           tmp0  = x[*idx];
3545:           sum1 -= *v1*tmp0;
3546:           sum2 -= *v2*tmp0;
3547:         }
3548:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3549:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3550:         row     -= 2;
3551:         break;
3552:       case 3:

3554:         sum1 = b[row];
3555:         sum2 = b[row-1];
3556:         sum3 = b[row-2];
3557:         v2   = a->a + diag[row-1] + 2;
3558:         v3   = a->a + diag[row-2] + 3;
3559:         for (n = 0; n<sz-1; n+=2) {
3560:           i1    = idx[0];
3561:           i2    = idx[1];
3562:           idx  += 2;
3563:           tmp0  = x[i1];
3564:           tmp1  = x[i2];
3565:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3566:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3567:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3568:         }

3570:         if (n == sz-1) {
3571:           tmp0  = x[*idx];
3572:           sum1 -= *v1*tmp0;
3573:           sum2 -= *v2*tmp0;
3574:           sum3 -= *v3*tmp0;
3575:         }
3576:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3577:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3578:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3579:         row     -= 3;
3580:         break;
3581:       case 4:

3583:         sum1 = b[row];
3584:         sum2 = b[row-1];
3585:         sum3 = b[row-2];
3586:         sum4 = b[row-3];
3587:         v2   = a->a + diag[row-1] + 2;
3588:         v3   = a->a + diag[row-2] + 3;
3589:         v4   = a->a + diag[row-3] + 4;
3590:         for (n = 0; n<sz-1; n+=2) {
3591:           i1    = idx[0];
3592:           i2    = idx[1];
3593:           idx  += 2;
3594:           tmp0  = x[i1];
3595:           tmp1  = x[i2];
3596:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3597:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3598:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3599:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3600:         }

3602:         if (n == sz-1) {
3603:           tmp0  = x[*idx];
3604:           sum1 -= *v1*tmp0;
3605:           sum2 -= *v2*tmp0;
3606:           sum3 -= *v3*tmp0;
3607:           sum4 -= *v4*tmp0;
3608:         }
3609:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3610:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3611:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3612:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3613:         row     -= 4;
3614:         break;
3615:       case 5:

3617:         sum1 = b[row];
3618:         sum2 = b[row-1];
3619:         sum3 = b[row-2];
3620:         sum4 = b[row-3];
3621:         sum5 = b[row-4];
3622:         v2   = a->a + diag[row-1] + 2;
3623:         v3   = a->a + diag[row-2] + 3;
3624:         v4   = a->a + diag[row-3] + 4;
3625:         v5   = a->a + diag[row-4] + 5;
3626:         for (n = 0; n<sz-1; n+=2) {
3627:           i1    = idx[0];
3628:           i2    = idx[1];
3629:           idx  += 2;
3630:           tmp0  = x[i1];
3631:           tmp1  = x[i2];
3632:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3634:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3635:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3636:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3637:         }

3639:         if (n == sz-1) {
3640:           tmp0  = x[*idx];
3641:           sum1 -= *v1*tmp0;
3642:           sum2 -= *v2*tmp0;
3643:           sum3 -= *v3*tmp0;
3644:           sum4 -= *v4*tmp0;
3645:           sum5 -= *v5*tmp0;
3646:         }
3647:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3648:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3649:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3650:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3651:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3652:         row     -= 5;
3653:         break;
3654:       default:
3655:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3656:       }
3657:     }
3658:     PetscLogFlops(a->nz);

3660:     /*
3661:            t = b - D x    where D is the block diagonal
3662:     */
3663:     cnt = 0;
3664:     for (i=0, row=0; i<m; i++) {
3665:       switch (sizes[i]) {
3666:       case 1:
3667:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3668:         break;
3669:       case 2:
3670:         x1       = x[row]; x2 = x[row+1];
3671:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3672:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3673:         t[row]   = b[row] - tmp1;
3674:         t[row+1] = b[row+1] - tmp2; row += 2;
3675:         cnt     += 4;
3676:         break;
3677:       case 3:
3678:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3679:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3680:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3681:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3682:         t[row]   = b[row] - tmp1;
3683:         t[row+1] = b[row+1] - tmp2;
3684:         t[row+2] = b[row+2] - tmp3; row += 3;
3685:         cnt     += 9;
3686:         break;
3687:       case 4:
3688:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3689:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3690:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3691:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3692:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3693:         t[row]   = b[row] - tmp1;
3694:         t[row+1] = b[row+1] - tmp2;
3695:         t[row+2] = b[row+2] - tmp3;
3696:         t[row+3] = b[row+3] - tmp4; row += 4;
3697:         cnt     += 16;
3698:         break;
3699:       case 5:
3700:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3701:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3702:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3703:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3704:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3705:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3706:         t[row]   = b[row] - tmp1;
3707:         t[row+1] = b[row+1] - tmp2;
3708:         t[row+2] = b[row+2] - tmp3;
3709:         t[row+3] = b[row+3] - tmp4;
3710:         t[row+4] = b[row+4] - tmp5;row += 5;
3711:         cnt     += 25;
3712:         break;
3713:       default:
3714:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3715:       }
3716:     }
3717:     PetscLogFlops(m);



3721:     /*
3722:           Apply (L + D)^-1 where D is the block diagonal
3723:     */
3724:     for (i=0, row=0; i<m; i++) {
3725:       sz  = diag[row] - ii[row];
3726:       v1  = a->a + ii[row];
3727:       idx = a->j + ii[row];
3728:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3729:       switch (sizes[i]) {
3730:       case 1:

3732:         sum1 = t[row];
3733:         for (n = 0; n<sz-1; n+=2) {
3734:           i1    = idx[0];
3735:           i2    = idx[1];
3736:           idx  += 2;
3737:           tmp0  = t[i1];
3738:           tmp1  = t[i2];
3739:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3740:         }

3742:         if (n == sz-1) {
3743:           tmp0  = t[*idx];
3744:           sum1 -= *v1 * tmp0;
3745:         }
3746:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3747:         break;
3748:       case 2:
3749:         v2   = a->a + ii[row+1];
3750:         sum1 = t[row];
3751:         sum2 = t[row+1];
3752:         for (n = 0; n<sz-1; n+=2) {
3753:           i1    = idx[0];
3754:           i2    = idx[1];
3755:           idx  += 2;
3756:           tmp0  = t[i1];
3757:           tmp1  = t[i2];
3758:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3759:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3760:         }

3762:         if (n == sz-1) {
3763:           tmp0  = t[*idx];
3764:           sum1 -= v1[0] * tmp0;
3765:           sum2 -= v2[0] * tmp0;
3766:         }
3767:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3768:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3769:         ibdiag   += 4; row += 2;
3770:         break;
3771:       case 3:
3772:         v2   = a->a + ii[row+1];
3773:         v3   = a->a + ii[row+2];
3774:         sum1 = t[row];
3775:         sum2 = t[row+1];
3776:         sum3 = t[row+2];
3777:         for (n = 0; n<sz-1; n+=2) {
3778:           i1    = idx[0];
3779:           i2    = idx[1];
3780:           idx  += 2;
3781:           tmp0  = t[i1];
3782:           tmp1  = t[i2];
3783:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3784:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3785:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3786:         }

3788:         if (n == sz-1) {
3789:           tmp0  = t[*idx];
3790:           sum1 -= v1[0] * tmp0;
3791:           sum2 -= v2[0] * tmp0;
3792:           sum3 -= v3[0] * tmp0;
3793:         }
3794:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3795:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3796:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3797:         ibdiag   += 9; row += 3;
3798:         break;
3799:       case 4:
3800:         v2   = a->a + ii[row+1];
3801:         v3   = a->a + ii[row+2];
3802:         v4   = a->a + ii[row+3];
3803:         sum1 = t[row];
3804:         sum2 = t[row+1];
3805:         sum3 = t[row+2];
3806:         sum4 = t[row+3];
3807:         for (n = 0; n<sz-1; n+=2) {
3808:           i1    = idx[0];
3809:           i2    = idx[1];
3810:           idx  += 2;
3811:           tmp0  = t[i1];
3812:           tmp1  = t[i2];
3813:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3814:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3815:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3816:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3817:         }

3819:         if (n == sz-1) {
3820:           tmp0  = t[*idx];
3821:           sum1 -= v1[0] * tmp0;
3822:           sum2 -= v2[0] * tmp0;
3823:           sum3 -= v3[0] * tmp0;
3824:           sum4 -= v4[0] * tmp0;
3825:         }
3826:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3827:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3828:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3829:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3830:         ibdiag   += 16; row += 4;
3831:         break;
3832:       case 5:
3833:         v2   = a->a + ii[row+1];
3834:         v3   = a->a + ii[row+2];
3835:         v4   = a->a + ii[row+3];
3836:         v5   = a->a + ii[row+4];
3837:         sum1 = t[row];
3838:         sum2 = t[row+1];
3839:         sum3 = t[row+2];
3840:         sum4 = t[row+3];
3841:         sum5 = t[row+4];
3842:         for (n = 0; n<sz-1; n+=2) {
3843:           i1    = idx[0];
3844:           i2    = idx[1];
3845:           idx  += 2;
3846:           tmp0  = t[i1];
3847:           tmp1  = t[i2];
3848:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3849:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3850:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3851:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3852:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3853:         }

3855:         if (n == sz-1) {
3856:           tmp0  = t[*idx];
3857:           sum1 -= v1[0] * tmp0;
3858:           sum2 -= v2[0] * tmp0;
3859:           sum3 -= v3[0] * tmp0;
3860:           sum4 -= v4[0] * tmp0;
3861:           sum5 -= v5[0] * tmp0;
3862:         }
3863:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3864:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3865:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3866:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3867:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3868:         ibdiag   += 25; row += 5;
3869:         break;
3870:       default:
3871:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3872:       }
3873:     }
3874:     PetscLogFlops(a->nz);
3875:   }
3876:   VecRestoreArray(xx,&x);
3877:   VecRestoreArrayRead(bb,&b);
3878:   return(0);
3879: }

3883: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3884: {
3885:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3886:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3887:   const MatScalar   *bdiag = a->inode.bdiag;
3888:   const PetscScalar *b;
3889:   PetscErrorCode    ierr;
3890:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3891:   const PetscInt    *sizes = a->inode.size;

3894:   VecGetArray(xx,&x);
3895:   VecGetArrayRead(bb,&b);
3896:   cnt  = 0;
3897:   for (i=0, row=0; i<m; i++) {
3898:     switch (sizes[i]) {
3899:     case 1:
3900:       x[row] = b[row]*bdiag[cnt++];row++;
3901:       break;
3902:     case 2:
3903:       x1       = b[row]; x2 = b[row+1];
3904:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3905:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3906:       x[row++] = tmp1;
3907:       x[row++] = tmp2;
3908:       cnt     += 4;
3909:       break;
3910:     case 3:
3911:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3912:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3913:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3914:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3915:       x[row++] = tmp1;
3916:       x[row++] = tmp2;
3917:       x[row++] = tmp3;
3918:       cnt     += 9;
3919:       break;
3920:     case 4:
3921:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3922:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3923:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3924:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3925:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3926:       x[row++] = tmp1;
3927:       x[row++] = tmp2;
3928:       x[row++] = tmp3;
3929:       x[row++] = tmp4;
3930:       cnt     += 16;
3931:       break;
3932:     case 5:
3933:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3934:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3935:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3936:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3937:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3938:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3939:       x[row++] = tmp1;
3940:       x[row++] = tmp2;
3941:       x[row++] = tmp3;
3942:       x[row++] = tmp4;
3943:       x[row++] = tmp5;
3944:       cnt     += 25;
3945:       break;
3946:     default:
3947:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3948:     }
3949:   }
3950:   PetscLogFlops(2*cnt);
3951:   VecRestoreArray(xx,&x);
3952:   VecRestoreArrayRead(bb,&b);
3953:   return(0);
3954: }

3956: /*
3957:     samestructure indicates that the matrix has not changed its nonzero structure so we
3958:     do not need to recompute the inodes
3959: */
3962: PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
3963: {
3964:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3966:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3967:   PetscBool      flag;

3970:   if (!a->inode.use) return(0);
3971:   if (a->inode.checked && samestructure) return(0);


3974:   m = A->rmap->n;
3975:   if (a->inode.size) ns = a->inode.size;
3976:   else {
3977:     PetscMalloc((m+1)*sizeof(PetscInt),&ns);
3978:   }

3980:   i          = 0;
3981:   node_count = 0;
3982:   idx        = a->j;
3983:   ii         = a->i;
3984:   while (i < m) {                /* For each row */
3985:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3986:     /* Limits the number of elements in a node to 'a->inode.limit' */
3987:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3988:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
3989:       if (nzy != nzx) break;
3990:       idy += nzx;              /* Same nonzero pattern */
3991:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3992:       if (!flag) break;
3993:     }
3994:     ns[node_count++] = blk_size;
3995:     idx             += blk_size*nzx;
3996:     i                = j;
3997:   }
3998:   /* If not enough inodes found,, do not use inode version of the routines */
3999:   if (!m || node_count > .8*m) {
4000:     PetscFree(ns);

4002:     a->inode.node_count       = 0;
4003:     a->inode.size             = NULL;
4004:     a->inode.use              = PETSC_FALSE;
4005:     A->ops->mult              = MatMult_SeqAIJ;
4006:     A->ops->sor               = MatSOR_SeqAIJ;
4007:     A->ops->multadd           = MatMultAdd_SeqAIJ;
4008:     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4009:     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4010:     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4011:     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4012:     A->ops->coloringpatch     = 0;
4013:     A->ops->multdiagonalblock = 0;

4015:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4016:   } else {
4017:     if (!A->factortype) {
4018:       A->ops->mult              = MatMult_SeqAIJ_Inode;
4019:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4020:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4021:       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4022:       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4023:       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4024:       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4025:       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4026:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4027:     } else {
4028:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4029:     }
4030:     a->inode.node_count = node_count;
4031:     a->inode.size       = ns;
4032:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4033:   }
4034:   a->inode.checked = PETSC_TRUE;
4035:   return(0);
4036: }

4040: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4041: {
4042:   Mat            B =*C;
4043:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4045:   PetscInt       m=A->rmap->n;

4048:   c->inode.use       = a->inode.use;
4049:   c->inode.limit     = a->inode.limit;
4050:   c->inode.max_limit = a->inode.max_limit;
4051:   if (a->inode.size) {
4052:     PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);
4053:     c->inode.node_count = a->inode.node_count;
4054:     PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4055:     /* note the table of functions below should match that in Mat_CheckInode() */
4056:     if (!B->factortype) {
4057:       B->ops->mult              = MatMult_SeqAIJ_Inode;
4058:       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4059:       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4060:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4061:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4062:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4063:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4064:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4065:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4066:     } else {
4067:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4068:     }
4069:   } else {
4070:     c->inode.size       = 0;
4071:     c->inode.node_count = 0;
4072:   }
4073:   c->inode.ibdiagvalid = PETSC_FALSE;
4074:   c->inode.ibdiag      = 0;
4075:   c->inode.bdiag       = 0;
4076:   return(0);
4077: }

4081: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4082: {
4083:   PetscInt k, *vi;

4086:   vi = aj + ai[row];
4087:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4088:   vi        = aj + adiag[row];
4089:   cols[nzl] = vi[0];
4090:   vi        = aj + adiag[row+1]+1;
4091:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4092:   return(0);
4093: }
4094: /*
4095:    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4096:    Modified from Mat_CheckInode().

4098:    Input Parameters:
4099: +  Mat A - ILU or LU matrix factor
4100: -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4101:     do not need to recompute the inodes
4102: */
4105: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure)
4106: {
4107:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4109:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4110:   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4111:   PetscBool      flag;

4114:   if (!a->inode.use)                     return(0);
4115:   if (a->inode.checked && samestructure) return(0);

4117:   m = A->rmap->n;
4118:   if (a->inode.size) ns = a->inode.size;
4119:   else {
4120:     PetscMalloc((m+1)*sizeof(PetscInt),&ns);
4121:   }

4123:   i          = 0;
4124:   node_count = 0;
4125:   while (i < m) {                /* For each row */
4126:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4127:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4128:     nzx  = nzl1 + nzu1 + 1;
4129:     PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
4130:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4132:     /* Limits the number of elements in a node to 'a->inode.limit' */
4133:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4134:       nzl2 = ai[j+1] - ai[j];
4135:       nzu2 = adiag[j] - adiag[j+1] - 1;
4136:       nzy  = nzl2 + nzu2 + 1;
4137:       if (nzy != nzx) break;
4138:       PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
4139:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4140:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4141:       if (!flag) {PetscFree(cols2);break;}
4142:       PetscFree(cols2);
4143:     }
4144:     ns[node_count++] = blk_size;
4145:     PetscFree(cols1);
4146:     i                = j;
4147:   }
4148:   /* If not enough inodes found,, do not use inode version of the routines */
4149:   if (!m || node_count > .8*m) {
4150:     PetscFree(ns);

4152:     a->inode.node_count = 0;
4153:     a->inode.size       = NULL;
4154:     a->inode.use        = PETSC_FALSE;

4156:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4157:   } else {
4158:     A->ops->mult              = 0;
4159:     A->ops->sor               = 0;
4160:     A->ops->multadd           = 0;
4161:     A->ops->getrowij          = 0;
4162:     A->ops->restorerowij      = 0;
4163:     A->ops->getcolumnij       = 0;
4164:     A->ops->restorecolumnij   = 0;
4165:     A->ops->coloringpatch     = 0;
4166:     A->ops->multdiagonalblock = 0;
4167:     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4168:     a->inode.node_count       = node_count;
4169:     a->inode.size             = ns;

4171:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4172:   }
4173:   a->inode.checked = PETSC_TRUE;
4174:   return(0);
4175: }

4179: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4180: {
4181:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4184:   a->inode.ibdiagvalid = PETSC_FALSE;
4185:   return(0);
4186: }

4188: /*
4189:      This is really ugly. if inodes are used this replaces the
4190:   permutations with ones that correspond to rows/cols of the matrix
4191:   rather then inode blocks
4192: */
4195: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4196: {

4200:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4201:   return(0);
4202: }

4206: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4207: {
4208:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4210:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4211:   const PetscInt *ridx,*cidx;
4212:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4213:   PetscInt       nslim_col,*ns_col;
4214:   IS             ris = *rperm,cis = *cperm;

4217:   if (!a->inode.size) return(0); /* no inodes so return */
4218:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4220:   Mat_CreateColInode(A,&nslim_col,&ns_col);
4221:   PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);
4222:   PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);

4224:   ISGetIndices(ris,&ridx);
4225:   ISGetIndices(cis,&cidx);

4227:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4228:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4230:   /* Construct the permutations for rows*/
4231:   for (i=0,row = 0; i<nslim_row; ++i) {
4232:     indx      = ridx[i];
4233:     start_val = tns[indx];
4234:     end_val   = tns[indx + 1];
4235:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4236:   }

4238:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4239:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4241:   /* Construct permutations for columns */
4242:   for (i=0,col=0; i<nslim_col; ++i) {
4243:     indx      = cidx[i];
4244:     start_val = tns[indx];
4245:     end_val   = tns[indx + 1];
4246:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4247:   }

4249:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4250:   ISSetPermutation(*rperm);
4251:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4252:   ISSetPermutation(*cperm);

4254:   ISRestoreIndices(ris,&ridx);
4255:   ISRestoreIndices(cis,&cidx);

4257:   PetscFree(ns_col);
4258:   PetscFree2(permr,permc);
4259:   ISDestroy(&cis);
4260:   ISDestroy(&ris);
4261:   PetscFree(tns);
4262:   return(0);
4263: }

4267: /*@C
4268:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4270:    Not Collective

4272:    Input Parameter:
4273: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4275:    Output Parameter:
4276: +  node_count - no of inodes present in the matrix.
4277: .  sizes      - an array of size node_count,with sizes of each inode.
4278: -  limit      - the max size used to generate the inodes.

4280:    Level: advanced

4282:    Notes: This routine returns some internal storage information
4283:    of the matrix, it is intended to be used by advanced users.
4284:    It should be called after the matrix is assembled.
4285:    The contents of the sizes[] array should not be changed.
4286:    NULL may be passed for information not requested.

4288: .keywords: matrix, seqaij, get, inode

4290: .seealso: MatGetInfo()
4291: @*/
4292: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4293: {
4294:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4297:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4298:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4299:   if (f) {
4300:     (*f)(A,node_count,sizes,limit);
4301:   }
4302:   return(0);
4303: }

4307: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4308: {
4309:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4312:   if (node_count) *node_count = a->inode.node_count;
4313:   if (sizes)      *sizes      = a->inode.size;
4314:   if (limit)      *limit      = a->inode.limit;
4315:   return(0);
4316: }