Actual source code: inode.c

  2: /*
  3:   This file provides high performance routines for the Inode format (compressed sparse row)
  4:   by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>

  8: static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
  9: {
 10:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 12:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 15:   n = A->cmap->n;
 16:   m = A->rmap->n;
 17:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
 18:   ns_row = a->inode.size;

 20:   min_mn = (m < n) ? m : n;
 21:   if (!ns) {
 22:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
 23:     for (; count+1 < n; count++,i++) ;
 24:     if (count < n)  {
 25:       i++;
 26:     }
 27:     *size = i;
 28:     return(0);
 29:   }
 30:   PetscMalloc1(n+1,&ns_col);

 32:   /* Use the same row structure wherever feasible. */
 33:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 34:     ns_col[i] = ns_row[i];
 35:   }

 37:   /* if m < n; pad up the remainder with inode_limit */
 38:   for (; count+1 < n; count++,i++) {
 39:     ns_col[i] = 1;
 40:   }
 41:   /* The last node is the odd ball. padd it up with the remaining rows; */
 42:   if (count < n) {
 43:     ns_col[i] = n - count;
 44:     i++;
 45:   } else if (count > n) {
 46:     /* Adjust for the over estimation */
 47:     ns_col[i-1] += n - count;
 48:   }
 49:   *size = i;
 50:   *ns   = ns_col;
 51:   return(0);
 52: }

 54: /*
 55:       This builds symmetric version of nonzero structure,
 56: */
 57: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 58: {
 59:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
 61:   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
 62:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
 63:   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;

 66:   nslim_row = a->inode.node_count;
 67:   m         = A->rmap->n;
 68:   n         = A->cmap->n;
 69:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 70:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");

 72:   /* Use the row_inode as column_inode */
 73:   nslim_col = nslim_row;
 74:   ns_col    = ns_row;

 76:   /* allocate space for reformated inode structure */
 77:   PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);
 78:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 80:   for (i1=0,col=0; i1<nslim_col; ++i1) {
 81:     nsz = ns_col[i1];
 82:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
 83:   }
 84:   /* allocate space for row pointers */
 85:   PetscCalloc1(nslim_row+1,&ia);
 86:   *iia = ia;
 87:   PetscMalloc1(nslim_row+1,&work);

 89:   /* determine the number of columns in each row */
 90:   ia[0] = oshift;
 91:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {

 93:     j    = aj + ai[row] + ishift;
 94:     jmax = aj + ai[row+1] + ishift;
 95:     if (j==jmax) continue; /* empty row */
 96:     col  = *j++ + ishift;
 97:     i2   = tvc[col];
 98:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
 99:       ia[i1+1]++;
100:       ia[i2+1]++;
101:       i2++;                     /* Start col of next node */
102:       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
103:       i2 = tvc[col];
104:     }
105:     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
106:   }

108:   /* shift ia[i] to point to next row */
109:   for (i1=1; i1<nslim_row+1; i1++) {
110:     row        = ia[i1-1];
111:     ia[i1]    += row;
112:     work[i1-1] = row - oshift;
113:   }

115:   /* allocate space for column pointers */
116:   nz   = ia[nslim_row] + (!ishift);
117:   PetscMalloc1(nz,&ja);
118:   *jja = ja;

120:   /* loop over lower triangular part putting into ja */
121:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
122:     j    = aj + ai[row] + ishift;
123:     jmax = aj + ai[row+1] + ishift;
124:     if (j==jmax) continue; /* empty row */
125:     col  = *j++ + ishift;
126:     i2   = tvc[col];
127:     while (i2<i1 && j<jmax) {
128:       ja[work[i2]++] = i1 + oshift;
129:       ja[work[i1]++] = i2 + oshift;
130:       ++i2;
131:       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
132:       i2 = tvc[col];
133:     }
134:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

136:   }
137:   PetscFree(work);
138:   PetscFree2(tns,tvc);
139:   return(0);
140: }

142: /*
143:       This builds nonsymmetric version of nonzero structure,
144: */
145: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
146: {
147:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
149:   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
150:   PetscInt       *tns,*tvc,nsz,i1,i2;
151:   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;

154:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
155:   nslim_row = a->inode.node_count;
156:   n         = A->cmap->n;

158:   /* Create The column_inode for this matrix */
159:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

161:   /* allocate space for reformated column_inode structure */
162:   PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);
163:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

165:   for (i1=0,col=0; i1<nslim_col; ++i1) {
166:     nsz = ns_col[i1];
167:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
168:   }
169:   /* allocate space for row pointers */
170:   PetscCalloc1(nslim_row+1,&ia);
171:   *iia = ia;
172:   PetscMalloc1(nslim_row+1,&work);

174:   /* determine the number of columns in each row */
175:   ia[0] = oshift;
176:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
177:     j   = aj + ai[row] + ishift;
178:     nz  = ai[row+1] - ai[row];
179:     if (!nz) continue; /* empty row */
180:     col = *j++ + ishift;
181:     i2  = tvc[col];
182:     while (nz-- > 0) {           /* off-diagonal elemets */
183:       ia[i1+1]++;
184:       i2++;                     /* Start col of next node */
185:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
186:       if (nz > 0) i2 = tvc[col];
187:     }
188:   }

190:   /* shift ia[i] to point to next row */
191:   for (i1=1; i1<nslim_row+1; i1++) {
192:     row        = ia[i1-1];
193:     ia[i1]    += row;
194:     work[i1-1] = row - oshift;
195:   }

197:   /* allocate space for column pointers */
198:   nz   = ia[nslim_row] + (!ishift);
199:   PetscMalloc1(nz,&ja);
200:   *jja = ja;

202:   /* loop over matrix putting into ja */
203:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
204:     j   = aj + ai[row] + ishift;
205:     nz  = ai[row+1] - ai[row];
206:     if (!nz) continue; /* empty row */
207:     col = *j++ + ishift;
208:     i2  = tvc[col];
209:     while (nz-- > 0) {
210:       ja[work[i1]++] = i2 + oshift;
211:       ++i2;
212:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
213:       if (nz > 0) i2 = tvc[col];
214:     }
215:   }
216:   PetscFree(ns_col);
217:   PetscFree(work);
218:   PetscFree2(tns,tvc);
219:   return(0);
220: }

222: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
223: {
224:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

228:   *n = a->inode.node_count;
229:   if (!ia) return(0);
230:   if (!blockcompressed) {
231:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
232:   } else if (symmetric) {
233:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
234:   } else {
235:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
236:   }
237:   return(0);
238: }

240: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
241: {

245:   if (!ia) return(0);

247:   if (!blockcompressed) {
248:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
249:   } else {
250:     PetscFree(*ia);
251:     PetscFree(*ja);
252:   }
253:   return(0);
254: }

256: /* ----------------------------------------------------------- */

258: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
259: {
260:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
262:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
263:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

266:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
267:   nslim_row = a->inode.node_count;
268:   n         = A->cmap->n;

270:   /* Create The column_inode for this matrix */
271:   MatCreateColInode_Private(A,&nslim_col,&ns_col);

273:   /* allocate space for reformated column_inode structure */
274:   PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);
275:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

277:   for (i1=0,col=0; i1<nslim_col; ++i1) {
278:     nsz = ns_col[i1];
279:     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
280:   }
281:   /* allocate space for column pointers */
282:   PetscCalloc1(nslim_col+1,&ia);
283:   *iia = ia;
284:   PetscMalloc1(nslim_col+1,&work);

286:   /* determine the number of columns in each row */
287:   ia[0] = oshift;
288:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
289:     j   = aj + ai[row] + ishift;
290:     col = *j++ + ishift;
291:     i2  = tvc[col];
292:     nz  = ai[row+1] - ai[row];
293:     while (nz-- > 0) {           /* off-diagonal elemets */
294:       /* ia[i1+1]++; */
295:       ia[i2+1]++;
296:       i2++;
297:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
298:       if (nz > 0) i2 = tvc[col];
299:     }
300:   }

302:   /* shift ia[i] to point to next col */
303:   for (i1=1; i1<nslim_col+1; i1++) {
304:     col        = ia[i1-1];
305:     ia[i1]    += col;
306:     work[i1-1] = col - oshift;
307:   }

309:   /* allocate space for column pointers */
310:   nz   = ia[nslim_col] + (!ishift);
311:   PetscMalloc1(nz,&ja);
312:   *jja = ja;

314:   /* loop over matrix putting into ja */
315:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
316:     j   = aj + ai[row] + ishift;
317:     col = *j++ + ishift;
318:     i2  = tvc[col];
319:     nz  = ai[row+1] - ai[row];
320:     while (nz-- > 0) {
321:       /* ja[work[i1]++] = i2 + oshift; */
322:       ja[work[i2]++] = i1 + oshift;
323:       i2++;
324:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
325:       if (nz > 0) i2 = tvc[col];
326:     }
327:   }
328:   PetscFree(ns_col);
329:   PetscFree(work);
330:   PetscFree2(tns,tvc);
331:   return(0);
332: }

334: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
335: {

339:   MatCreateColInode_Private(A,n,NULL);
340:   if (!ia) return(0);

342:   if (!blockcompressed) {
343:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
344:   } else if (symmetric) {
345:     /* Since the indices are symmetric it doesn't matter */
346:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
347:   } else {
348:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
349:   }
350:   return(0);
351: }

353: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
354: {

358:   if (!ia) return(0);
359:   if (!blockcompressed) {
360:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);
361:   } else {
362:     PetscFree(*ia);
363:     PetscFree(*ja);
364:   }
365:   return(0);
366: }

368: /* ----------------------------------------------------------- */

370: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
371: {
372:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
373:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
374:   PetscScalar       *y;
375:   const PetscScalar *x;
376:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
377:   PetscErrorCode    ierr;
378:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
379:   const PetscInt    *idx,*ns,*ii;

381: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
382: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
383: #endif

386:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
387:   node_max = a->inode.node_count;
388:   ns       = a->inode.size;     /* Node Size array */
389:   VecGetArrayRead(xx,&x);
390:   VecGetArray(yy,&y);
391:   idx      = a->j;
392:   v1       = a->a;
393:   ii       = a->i;

395:   for (i = 0,row = 0; i< node_max; ++i) {
396:     nsz         = ns[i];
397:     n           = ii[1] - ii[0];
398:     nonzerorow += (n>0)*nsz;
399:     ii         += nsz;
400:     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
401:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
402:     sz = n;                     /* No of non zeros in this row */
403:                                 /* Switch on the size of Node */
404:     switch (nsz) {               /* Each loop in 'case' is unrolled */
405:     case 1:
406:       sum1 = 0.;

408:       for (n = 0; n< sz-1; n+=2) {
409:         i1    = idx[0];         /* The instructions are ordered to */
410:         i2    = idx[1];         /* make the compiler's job easy */
411:         idx  += 2;
412:         tmp0  = x[i1];
413:         tmp1  = x[i2];
414:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
415:       }

417:       if (n == sz-1) {          /* Take care of the last nonzero  */
418:         tmp0  = x[*idx++];
419:         sum1 += *v1++ *tmp0;
420:       }
421:       y[row++]=sum1;
422:       break;
423:     case 2:
424:       sum1 = 0.;
425:       sum2 = 0.;
426:       v2   = v1 + n;

428:       for (n = 0; n< sz-1; n+=2) {
429:         i1    = idx[0];
430:         i2    = idx[1];
431:         idx  += 2;
432:         tmp0  = x[i1];
433:         tmp1  = x[i2];
434:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
436:       }
437:       if (n == sz-1) {
438:         tmp0  = x[*idx++];
439:         sum1 += *v1++ * tmp0;
440:         sum2 += *v2++ * tmp0;
441:       }
442:       y[row++]=sum1;
443:       y[row++]=sum2;
444:       v1      =v2;              /* Since the next block to be processed starts there*/
445:       idx    +=sz;
446:       break;
447:     case 3:
448:       sum1 = 0.;
449:       sum2 = 0.;
450:       sum3 = 0.;
451:       v2   = v1 + n;
452:       v3   = v2 + n;

454:       for (n = 0; n< sz-1; n+=2) {
455:         i1    = idx[0];
456:         i2    = idx[1];
457:         idx  += 2;
458:         tmp0  = x[i1];
459:         tmp1  = x[i2];
460:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
463:       }
464:       if (n == sz-1) {
465:         tmp0  = x[*idx++];
466:         sum1 += *v1++ * tmp0;
467:         sum2 += *v2++ * tmp0;
468:         sum3 += *v3++ * tmp0;
469:       }
470:       y[row++]=sum1;
471:       y[row++]=sum2;
472:       y[row++]=sum3;
473:       v1      =v3;              /* Since the next block to be processed starts there*/
474:       idx    +=2*sz;
475:       break;
476:     case 4:
477:       sum1 = 0.;
478:       sum2 = 0.;
479:       sum3 = 0.;
480:       sum4 = 0.;
481:       v2   = v1 + n;
482:       v3   = v2 + n;
483:       v4   = v3 + n;

485:       for (n = 0; n< sz-1; n+=2) {
486:         i1    = idx[0];
487:         i2    = idx[1];
488:         idx  += 2;
489:         tmp0  = x[i1];
490:         tmp1  = x[i2];
491:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
492:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
493:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
494:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
495:       }
496:       if (n == sz-1) {
497:         tmp0  = x[*idx++];
498:         sum1 += *v1++ * tmp0;
499:         sum2 += *v2++ * tmp0;
500:         sum3 += *v3++ * tmp0;
501:         sum4 += *v4++ * tmp0;
502:       }
503:       y[row++]=sum1;
504:       y[row++]=sum2;
505:       y[row++]=sum3;
506:       y[row++]=sum4;
507:       v1      =v4;              /* Since the next block to be processed starts there*/
508:       idx    +=3*sz;
509:       break;
510:     case 5:
511:       sum1 = 0.;
512:       sum2 = 0.;
513:       sum3 = 0.;
514:       sum4 = 0.;
515:       sum5 = 0.;
516:       v2   = v1 + n;
517:       v3   = v2 + n;
518:       v4   = v3 + n;
519:       v5   = v4 + n;

521:       for (n = 0; n<sz-1; n+=2) {
522:         i1    = idx[0];
523:         i2    = idx[1];
524:         idx  += 2;
525:         tmp0  = x[i1];
526:         tmp1  = x[i2];
527:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
528:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
529:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
530:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
531:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
532:       }
533:       if (n == sz-1) {
534:         tmp0  = x[*idx++];
535:         sum1 += *v1++ * tmp0;
536:         sum2 += *v2++ * tmp0;
537:         sum3 += *v3++ * tmp0;
538:         sum4 += *v4++ * tmp0;
539:         sum5 += *v5++ * tmp0;
540:       }
541:       y[row++]=sum1;
542:       y[row++]=sum2;
543:       y[row++]=sum3;
544:       y[row++]=sum4;
545:       y[row++]=sum5;
546:       v1      =v5;       /* Since the next block to be processed starts there */
547:       idx    +=4*sz;
548:       break;
549:     default:
550:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
551:     }
552:   }
553:   VecRestoreArrayRead(xx,&x);
554:   VecRestoreArray(yy,&y);
555:   PetscLogFlops(2.0*a->nz - nonzerorow);
556:   return(0);
557: }
558: /* ----------------------------------------------------------- */
559: /* Almost same code as the MatMult_SeqAIJ_Inode() */
560: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
561: {
562:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
563:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
564:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
565:   const PetscScalar *x;
566:   PetscScalar       *y,*z,*zt;
567:   PetscErrorCode    ierr;
568:   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
569:   const PetscInt    *idx,*ns,*ii;

572:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
573:   node_max = a->inode.node_count;
574:   ns       = a->inode.size;     /* Node Size array */

576:   VecGetArrayRead(xx,&x);
577:   VecGetArrayPair(zz,yy,&z,&y);
578:   zt = z;

580:   idx = a->j;
581:   v1  = a->a;
582:   ii  = a->i;

584:   for (i = 0,row = 0; i< node_max; ++i) {
585:     nsz = ns[i];
586:     n   = ii[1] - ii[0];
587:     ii += nsz;
588:     sz  = n;                    /* No of non zeros in this row */
589:                                 /* Switch on the size of Node */
590:     switch (nsz) {               /* Each loop in 'case' is unrolled */
591:     case 1:
592:       sum1 = *zt++;

594:       for (n = 0; n< sz-1; n+=2) {
595:         i1    = idx[0];         /* The instructions are ordered to */
596:         i2    = idx[1];         /* make the compiler's job easy */
597:         idx  += 2;
598:         tmp0  = x[i1];
599:         tmp1  = x[i2];
600:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
601:       }

603:       if (n   == sz-1) {          /* Take care of the last nonzero  */
604:         tmp0  = x[*idx++];
605:         sum1 += *v1++ * tmp0;
606:       }
607:       y[row++]=sum1;
608:       break;
609:     case 2:
610:       sum1 = *zt++;
611:       sum2 = *zt++;
612:       v2   = v1 + n;

614:       for (n = 0; n< sz-1; n+=2) {
615:         i1    = idx[0];
616:         i2    = idx[1];
617:         idx  += 2;
618:         tmp0  = x[i1];
619:         tmp1  = x[i2];
620:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
621:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
622:       }
623:       if (n   == sz-1) {
624:         tmp0  = x[*idx++];
625:         sum1 += *v1++ * tmp0;
626:         sum2 += *v2++ * tmp0;
627:       }
628:       y[row++]=sum1;
629:       y[row++]=sum2;
630:       v1      =v2;              /* Since the next block to be processed starts there*/
631:       idx    +=sz;
632:       break;
633:     case 3:
634:       sum1 = *zt++;
635:       sum2 = *zt++;
636:       sum3 = *zt++;
637:       v2   = v1 + n;
638:       v3   = v2 + n;

640:       for (n = 0; n< sz-1; n+=2) {
641:         i1    = idx[0];
642:         i2    = idx[1];
643:         idx  += 2;
644:         tmp0  = x[i1];
645:         tmp1  = x[i2];
646:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
647:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
648:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
649:       }
650:       if (n == sz-1) {
651:         tmp0  = x[*idx++];
652:         sum1 += *v1++ * tmp0;
653:         sum2 += *v2++ * tmp0;
654:         sum3 += *v3++ * tmp0;
655:       }
656:       y[row++]=sum1;
657:       y[row++]=sum2;
658:       y[row++]=sum3;
659:       v1      =v3;              /* Since the next block to be processed starts there*/
660:       idx    +=2*sz;
661:       break;
662:     case 4:
663:       sum1 = *zt++;
664:       sum2 = *zt++;
665:       sum3 = *zt++;
666:       sum4 = *zt++;
667:       v2   = v1 + n;
668:       v3   = v2 + n;
669:       v4   = v3 + n;

671:       for (n = 0; n< sz-1; n+=2) {
672:         i1    = idx[0];
673:         i2    = idx[1];
674:         idx  += 2;
675:         tmp0  = x[i1];
676:         tmp1  = x[i2];
677:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
678:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
679:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
680:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
681:       }
682:       if (n == sz-1) {
683:         tmp0  = x[*idx++];
684:         sum1 += *v1++ * tmp0;
685:         sum2 += *v2++ * tmp0;
686:         sum3 += *v3++ * tmp0;
687:         sum4 += *v4++ * tmp0;
688:       }
689:       y[row++]=sum1;
690:       y[row++]=sum2;
691:       y[row++]=sum3;
692:       y[row++]=sum4;
693:       v1      =v4;              /* Since the next block to be processed starts there*/
694:       idx    +=3*sz;
695:       break;
696:     case 5:
697:       sum1 = *zt++;
698:       sum2 = *zt++;
699:       sum3 = *zt++;
700:       sum4 = *zt++;
701:       sum5 = *zt++;
702:       v2   = v1 + n;
703:       v3   = v2 + n;
704:       v4   = v3 + n;
705:       v5   = v4 + n;

707:       for (n = 0; n<sz-1; n+=2) {
708:         i1    = idx[0];
709:         i2    = idx[1];
710:         idx  += 2;
711:         tmp0  = x[i1];
712:         tmp1  = x[i2];
713:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
714:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
715:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
716:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
717:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
718:       }
719:       if (n   == sz-1) {
720:         tmp0  = x[*idx++];
721:         sum1 += *v1++ * tmp0;
722:         sum2 += *v2++ * tmp0;
723:         sum3 += *v3++ * tmp0;
724:         sum4 += *v4++ * tmp0;
725:         sum5 += *v5++ * tmp0;
726:       }
727:       y[row++]=sum1;
728:       y[row++]=sum2;
729:       y[row++]=sum3;
730:       y[row++]=sum4;
731:       y[row++]=sum5;
732:       v1      =v5;       /* Since the next block to be processed starts there */
733:       idx    +=4*sz;
734:       break;
735:     default:
736:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
737:     }
738:   }
739:   VecRestoreArrayRead(xx,&x);
740:   VecRestoreArrayPair(zz,yy,&z,&y);
741:   PetscLogFlops(2.0*a->nz);
742:   return(0);
743: }

745: /* ----------------------------------------------------------- */
746: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
747: {
748:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
749:   IS                iscol = a->col,isrow = a->row;
750:   PetscErrorCode    ierr;
751:   const PetscInt    *r,*c,*rout,*cout;
752:   PetscInt          i,j,n = A->rmap->n,nz;
753:   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
754:   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
755:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
756:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
757:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
758:   const PetscScalar *b;

761:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
762:   node_max = a->inode.node_count;
763:   ns       = a->inode.size;     /* Node Size array */

765:   VecGetArrayRead(bb,&b);
766:   VecGetArrayWrite(xx,&x);
767:   tmp  = a->solve_work;

769:   ISGetIndices(isrow,&rout); r = rout;
770:   ISGetIndices(iscol,&cout); c = cout + (n-1);

772:   /* forward solve the lower triangular */
773:   tmps = tmp;
774:   aa   = a_a;
775:   aj   = a_j;
776:   ad   = a->diag;

778:   for (i = 0,row = 0; i< node_max; ++i) {
779:     nsz = ns[i];
780:     aii = ai[row];
781:     v1  = aa + aii;
782:     vi  = aj + aii;
783:     nz  = ad[row]- aii;
784:     if (i < node_max-1) {
785:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
786:       * but our indexing to determine it's size could. */
787:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
788:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
789:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
790:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
791:     }

793:     switch (nsz) {               /* Each loop in 'case' is unrolled */
794:     case 1:
795:       sum1 = b[*r++];
796:       for (j=0; j<nz-1; j+=2) {
797:         i0    = vi[0];
798:         i1    = vi[1];
799:         vi   +=2;
800:         tmp0  = tmps[i0];
801:         tmp1  = tmps[i1];
802:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
803:       }
804:       if (j == nz-1) {
805:         tmp0  = tmps[*vi++];
806:         sum1 -= *v1++ *tmp0;
807:       }
808:       tmp[row++]=sum1;
809:       break;
810:     case 2:
811:       sum1 = b[*r++];
812:       sum2 = b[*r++];
813:       v2   = aa + ai[row+1];

815:       for (j=0; j<nz-1; j+=2) {
816:         i0    = vi[0];
817:         i1    = vi[1];
818:         vi   +=2;
819:         tmp0  = tmps[i0];
820:         tmp1  = tmps[i1];
821:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
823:       }
824:       if (j == nz-1) {
825:         tmp0  = tmps[*vi++];
826:         sum1 -= *v1++ *tmp0;
827:         sum2 -= *v2++ *tmp0;
828:       }
829:       sum2     -= *v2++ *sum1;
830:       tmp[row++]=sum1;
831:       tmp[row++]=sum2;
832:       break;
833:     case 3:
834:       sum1 = b[*r++];
835:       sum2 = b[*r++];
836:       sum3 = b[*r++];
837:       v2   = aa + ai[row+1];
838:       v3   = aa + ai[row+2];

840:       for (j=0; j<nz-1; j+=2) {
841:         i0    = vi[0];
842:         i1    = vi[1];
843:         vi   +=2;
844:         tmp0  = tmps[i0];
845:         tmp1  = tmps[i1];
846:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
849:       }
850:       if (j == nz-1) {
851:         tmp0  = tmps[*vi++];
852:         sum1 -= *v1++ *tmp0;
853:         sum2 -= *v2++ *tmp0;
854:         sum3 -= *v3++ *tmp0;
855:       }
856:       sum2 -= *v2++ * sum1;
857:       sum3 -= *v3++ * sum1;
858:       sum3 -= *v3++ * sum2;

860:       tmp[row++]=sum1;
861:       tmp[row++]=sum2;
862:       tmp[row++]=sum3;
863:       break;

865:     case 4:
866:       sum1 = b[*r++];
867:       sum2 = b[*r++];
868:       sum3 = b[*r++];
869:       sum4 = b[*r++];
870:       v2   = aa + ai[row+1];
871:       v3   = aa + ai[row+2];
872:       v4   = aa + ai[row+3];

874:       for (j=0; j<nz-1; j+=2) {
875:         i0    = vi[0];
876:         i1    = vi[1];
877:         vi   +=2;
878:         tmp0  = tmps[i0];
879:         tmp1  = tmps[i1];
880:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
884:       }
885:       if (j == nz-1) {
886:         tmp0  = tmps[*vi++];
887:         sum1 -= *v1++ *tmp0;
888:         sum2 -= *v2++ *tmp0;
889:         sum3 -= *v3++ *tmp0;
890:         sum4 -= *v4++ *tmp0;
891:       }
892:       sum2 -= *v2++ * sum1;
893:       sum3 -= *v3++ * sum1;
894:       sum4 -= *v4++ * sum1;
895:       sum3 -= *v3++ * sum2;
896:       sum4 -= *v4++ * sum2;
897:       sum4 -= *v4++ * sum3;

899:       tmp[row++]=sum1;
900:       tmp[row++]=sum2;
901:       tmp[row++]=sum3;
902:       tmp[row++]=sum4;
903:       break;
904:     case 5:
905:       sum1 = b[*r++];
906:       sum2 = b[*r++];
907:       sum3 = b[*r++];
908:       sum4 = b[*r++];
909:       sum5 = b[*r++];
910:       v2   = aa + ai[row+1];
911:       v3   = aa + ai[row+2];
912:       v4   = aa + ai[row+3];
913:       v5   = aa + ai[row+4];

915:       for (j=0; j<nz-1; j+=2) {
916:         i0    = vi[0];
917:         i1    = vi[1];
918:         vi   +=2;
919:         tmp0  = tmps[i0];
920:         tmp1  = tmps[i1];
921:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
922:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
923:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
924:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
925:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
926:       }
927:       if (j == nz-1) {
928:         tmp0  = tmps[*vi++];
929:         sum1 -= *v1++ *tmp0;
930:         sum2 -= *v2++ *tmp0;
931:         sum3 -= *v3++ *tmp0;
932:         sum4 -= *v4++ *tmp0;
933:         sum5 -= *v5++ *tmp0;
934:       }

936:       sum2 -= *v2++ * sum1;
937:       sum3 -= *v3++ * sum1;
938:       sum4 -= *v4++ * sum1;
939:       sum5 -= *v5++ * sum1;
940:       sum3 -= *v3++ * sum2;
941:       sum4 -= *v4++ * sum2;
942:       sum5 -= *v5++ * sum2;
943:       sum4 -= *v4++ * sum3;
944:       sum5 -= *v5++ * sum3;
945:       sum5 -= *v5++ * sum4;

947:       tmp[row++]=sum1;
948:       tmp[row++]=sum2;
949:       tmp[row++]=sum3;
950:       tmp[row++]=sum4;
951:       tmp[row++]=sum5;
952:       break;
953:     default:
954:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
955:     }
956:   }
957:   /* backward solve the upper triangular */
958:   for (i=node_max -1,row = n-1; i>=0; i--) {
959:     nsz = ns[i];
960:     aii = ai[row+1] -1;
961:     v1  = aa + aii;
962:     vi  = aj + aii;
963:     nz  = aii- ad[row];
964:     switch (nsz) {               /* Each loop in 'case' is unrolled */
965:     case 1:
966:       sum1 = tmp[row];

968:       for (j=nz; j>1; j-=2) {
969:         vi   -=2;
970:         i0    = vi[2];
971:         i1    = vi[1];
972:         tmp0  = tmps[i0];
973:         tmp1  = tmps[i1];
974:         v1   -= 2;
975:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
976:       }
977:       if (j==1) {
978:         tmp0  = tmps[*vi--];
979:         sum1 -= *v1-- * tmp0;
980:       }
981:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
982:       break;
983:     case 2:
984:       sum1 = tmp[row];
985:       sum2 = tmp[row -1];
986:       v2   = aa + ai[row]-1;
987:       for (j=nz; j>1; j-=2) {
988:         vi   -=2;
989:         i0    = vi[2];
990:         i1    = vi[1];
991:         tmp0  = tmps[i0];
992:         tmp1  = tmps[i1];
993:         v1   -= 2;
994:         v2   -= 2;
995:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
996:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
997:       }
998:       if (j==1) {
999:         tmp0  = tmps[*vi--];
1000:         sum1 -= *v1-- * tmp0;
1001:         sum2 -= *v2-- * tmp0;
1002:       }

1004:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1005:       sum2   -= *v2-- * tmp0;
1006:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1007:       break;
1008:     case 3:
1009:       sum1 = tmp[row];
1010:       sum2 = tmp[row -1];
1011:       sum3 = tmp[row -2];
1012:       v2   = aa + ai[row]-1;
1013:       v3   = aa + ai[row -1]-1;
1014:       for (j=nz; j>1; j-=2) {
1015:         vi   -=2;
1016:         i0    = vi[2];
1017:         i1    = vi[1];
1018:         tmp0  = tmps[i0];
1019:         tmp1  = tmps[i1];
1020:         v1   -= 2;
1021:         v2   -= 2;
1022:         v3   -= 2;
1023:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1026:       }
1027:       if (j==1) {
1028:         tmp0  = tmps[*vi--];
1029:         sum1 -= *v1-- * tmp0;
1030:         sum2 -= *v2-- * tmp0;
1031:         sum3 -= *v3-- * tmp0;
1032:       }
1033:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1034:       sum2   -= *v2-- * tmp0;
1035:       sum3   -= *v3-- * tmp0;
1036:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1037:       sum3   -= *v3-- * tmp0;
1038:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;

1040:       break;
1041:     case 4:
1042:       sum1 = tmp[row];
1043:       sum2 = tmp[row -1];
1044:       sum3 = tmp[row -2];
1045:       sum4 = tmp[row -3];
1046:       v2   = aa + ai[row]-1;
1047:       v3   = aa + ai[row -1]-1;
1048:       v4   = aa + ai[row -2]-1;

1050:       for (j=nz; j>1; j-=2) {
1051:         vi   -=2;
1052:         i0    = vi[2];
1053:         i1    = vi[1];
1054:         tmp0  = tmps[i0];
1055:         tmp1  = tmps[i1];
1056:         v1   -= 2;
1057:         v2   -= 2;
1058:         v3   -= 2;
1059:         v4   -= 2;
1060:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1061:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1062:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1063:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1064:       }
1065:       if (j==1) {
1066:         tmp0  = tmps[*vi--];
1067:         sum1 -= *v1-- * tmp0;
1068:         sum2 -= *v2-- * tmp0;
1069:         sum3 -= *v3-- * tmp0;
1070:         sum4 -= *v4-- * tmp0;
1071:       }

1073:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1074:       sum2   -= *v2-- * tmp0;
1075:       sum3   -= *v3-- * tmp0;
1076:       sum4   -= *v4-- * tmp0;
1077:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1078:       sum3   -= *v3-- * tmp0;
1079:       sum4   -= *v4-- * tmp0;
1080:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1081:       sum4   -= *v4-- * tmp0;
1082:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1083:       break;
1084:     case 5:
1085:       sum1 = tmp[row];
1086:       sum2 = tmp[row -1];
1087:       sum3 = tmp[row -2];
1088:       sum4 = tmp[row -3];
1089:       sum5 = tmp[row -4];
1090:       v2   = aa + ai[row]-1;
1091:       v3   = aa + ai[row -1]-1;
1092:       v4   = aa + ai[row -2]-1;
1093:       v5   = aa + ai[row -3]-1;
1094:       for (j=nz; j>1; j-=2) {
1095:         vi   -= 2;
1096:         i0    = vi[2];
1097:         i1    = vi[1];
1098:         tmp0  = tmps[i0];
1099:         tmp1  = tmps[i1];
1100:         v1   -= 2;
1101:         v2   -= 2;
1102:         v3   -= 2;
1103:         v4   -= 2;
1104:         v5   -= 2;
1105:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1106:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1107:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1108:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1109:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1110:       }
1111:       if (j==1) {
1112:         tmp0  = tmps[*vi--];
1113:         sum1 -= *v1-- * tmp0;
1114:         sum2 -= *v2-- * tmp0;
1115:         sum3 -= *v3-- * tmp0;
1116:         sum4 -= *v4-- * tmp0;
1117:         sum5 -= *v5-- * tmp0;
1118:       }

1120:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1121:       sum2   -= *v2-- * tmp0;
1122:       sum3   -= *v3-- * tmp0;
1123:       sum4   -= *v4-- * tmp0;
1124:       sum5   -= *v5-- * tmp0;
1125:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1126:       sum3   -= *v3-- * tmp0;
1127:       sum4   -= *v4-- * tmp0;
1128:       sum5   -= *v5-- * tmp0;
1129:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1130:       sum4   -= *v4-- * tmp0;
1131:       sum5   -= *v5-- * tmp0;
1132:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1133:       sum5   -= *v5-- * tmp0;
1134:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1135:       break;
1136:     default:
1137:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1138:     }
1139:   }
1140:   ISRestoreIndices(isrow,&rout);
1141:   ISRestoreIndices(iscol,&cout);
1142:   VecRestoreArrayRead(bb,&b);
1143:   VecRestoreArrayWrite(xx,&x);
1144:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1145:   return(0);
1146: }

1148: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1149: {
1150:   Mat             C     =B;
1151:   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1152:   IS              isrow = b->row,isicol = b->icol;
1153:   PetscErrorCode  ierr;
1154:   const PetscInt  *r,*ic,*ics;
1155:   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1156:   PetscInt        i,j,k,nz,nzL,row,*pj;
1157:   const PetscInt  *ajtmp,*bjtmp;
1158:   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1159:   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1160:   FactorShiftCtx  sctx;
1161:   const PetscInt  *ddiag;
1162:   PetscReal       rs;
1163:   MatScalar       d;
1164:   PetscInt        inod,nodesz,node_max,col;
1165:   const PetscInt  *ns;
1166:   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;

1169:   /* MatPivotSetUp(): initialize shift context sctx */
1170:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

1172:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1173:     ddiag          = a->diag;
1174:     sctx.shift_top = info->zeropivot;
1175:     for (i=0; i<n; i++) {
1176:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1177:       d  = (aa)[ddiag[i]];
1178:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1179:       v  = aa+ai[i];
1180:       nz = ai[i+1] - ai[i];
1181:       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1182:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1183:     }
1184:     sctx.shift_top *= 1.1;
1185:     sctx.nshift_max = 5;
1186:     sctx.shift_lo   = 0.;
1187:     sctx.shift_hi   = 1.;
1188:   }

1190:   ISGetIndices(isrow,&r);
1191:   ISGetIndices(isicol,&ic);

1193:   PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1194:   ics   = ic;

1196:   node_max = a->inode.node_count;
1197:   ns       = a->inode.size;
1198:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1200:   /* If max inode size > 4, split it into two inodes.*/
1201:   /* also map the inode sizes according to the ordering */
1202:   PetscMalloc1(n+1,&tmp_vec1);
1203:   for (i=0,j=0; i<node_max; ++i,++j) {
1204:     if (ns[i] > 4) {
1205:       tmp_vec1[j] = 4;
1206:       ++j;
1207:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1208:     } else {
1209:       tmp_vec1[j] = ns[i];
1210:     }
1211:   }
1212:   /* Use the correct node_max */
1213:   node_max = j;

1215:   /* Now reorder the inode info based on mat re-ordering info */
1216:   /* First create a row -> inode_size_array_index map */
1217:   PetscMalloc1(n+1,&nsmap);
1218:   PetscMalloc1(node_max+1,&tmp_vec2);
1219:   for (i=0,row=0; i<node_max; i++) {
1220:     nodesz = tmp_vec1[i];
1221:     for (j=0; j<nodesz; j++,row++) {
1222:       nsmap[row] = i;
1223:     }
1224:   }
1225:   /* Using nsmap, create a reordered ns structure */
1226:   for (i=0,j=0; i< node_max; i++) {
1227:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1228:     tmp_vec2[i] = nodesz;
1229:     j          += nodesz;
1230:   }
1231:   PetscFree(nsmap);
1232:   PetscFree(tmp_vec1);

1234:   /* Now use the correct ns */
1235:   ns = tmp_vec2;

1237:   do {
1238:     sctx.newshift = PETSC_FALSE;
1239:     /* Now loop over each block-row, and do the factorization */
1240:     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1241:       nodesz = ns[inod];

1243:       switch (nodesz) {
1244:       case 1:
1245:         /*----------*/
1246:         /* zero rtmp1 */
1247:         /* L part */
1248:         nz    = bi[i+1] - bi[i];
1249:         bjtmp = bj + bi[i];
1250:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1252:         /* U part */
1253:         nz    = bdiag[i]-bdiag[i+1];
1254:         bjtmp = bj + bdiag[i+1]+1;
1255:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1257:         /* load in initial (unfactored row) */
1258:         nz    = ai[r[i]+1] - ai[r[i]];
1259:         ajtmp = aj + ai[r[i]];
1260:         v     = aa + ai[r[i]];
1261:         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1263:         /* ZeropivotApply() */
1264:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */

1266:         /* elimination */
1267:         bjtmp = bj + bi[i];
1268:         row   = *bjtmp++;
1269:         nzL   = bi[i+1] - bi[i];
1270:         for (k=0; k < nzL; k++) {
1271:           pc = rtmp1 + row;
1272:           if (*pc != 0.0) {
1273:             pv   = b->a + bdiag[row];
1274:             mul1 = *pc * (*pv);
1275:             *pc  = mul1;
1276:             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1277:             pv   = b->a + bdiag[row+1]+1;
1278:             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1279:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1280:             PetscLogFlops(1+2.0*nz);
1281:           }
1282:           row = *bjtmp++;
1283:         }

1285:         /* finished row so stick it into b->a */
1286:         rs = 0.0;
1287:         /* L part */
1288:         pv = b->a + bi[i];
1289:         pj = b->j + bi[i];
1290:         nz = bi[i+1] - bi[i];
1291:         for (j=0; j<nz; j++) {
1292:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1293:         }

1295:         /* U part */
1296:         pv = b->a + bdiag[i+1]+1;
1297:         pj = b->j + bdiag[i+1]+1;
1298:         nz = bdiag[i] - bdiag[i+1]-1;
1299:         for (j=0; j<nz; j++) {
1300:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1301:         }

1303:         /* Check zero pivot */
1304:         sctx.rs = rs;
1305:         sctx.pv = rtmp1[i];
1306:         MatPivotCheck(B,A,info,&sctx,i);
1307:         if (sctx.newshift) break;

1309:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1310:         pv  = b->a + bdiag[i];
1311:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1312:         break;

1314:       case 2:
1315:         /*----------*/
1316:         /* zero rtmp1 and rtmp2 */
1317:         /* L part */
1318:         nz    = bi[i+1] - bi[i];
1319:         bjtmp = bj + bi[i];
1320:         for  (j=0; j<nz; j++) {
1321:           col        = bjtmp[j];
1322:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1323:         }

1325:         /* U part */
1326:         nz    = bdiag[i]-bdiag[i+1];
1327:         bjtmp = bj + bdiag[i+1]+1;
1328:         for  (j=0; j<nz; j++) {
1329:           col        = bjtmp[j];
1330:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1331:         }

1333:         /* load in initial (unfactored row) */
1334:         nz    = ai[r[i]+1] - ai[r[i]];
1335:         ajtmp = aj + ai[r[i]];
1336:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1337:         for (j=0; j<nz; j++) {
1338:           col        = ics[ajtmp[j]];
1339:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1340:         }
1341:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1342:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;

1344:         /* elimination */
1345:         bjtmp = bj + bi[i];
1346:         row   = *bjtmp++; /* pivot row */
1347:         nzL   = bi[i+1] - bi[i];
1348:         for (k=0; k < nzL; k++) {
1349:           pc1 = rtmp1 + row;
1350:           pc2 = rtmp2 + row;
1351:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1352:             pv   = b->a + bdiag[row];
1353:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1354:             *pc1 = mul1;       *pc2 = mul2;

1356:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1357:             pv = b->a + bdiag[row+1]+1;
1358:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1359:             for (j=0; j<nz; j++) {
1360:               col         = pj[j];
1361:               rtmp1[col] -= mul1 * pv[j];
1362:               rtmp2[col] -= mul2 * pv[j];
1363:             }
1364:             PetscLogFlops(2+4.0*nz);
1365:           }
1366:           row = *bjtmp++;
1367:         }

1369:         /* finished row i; check zero pivot, then stick row i into b->a */
1370:         rs = 0.0;
1371:         /* L part */
1372:         pc1 = b->a + bi[i];
1373:         pj  = b->j + bi[i];
1374:         nz  = bi[i+1] - bi[i];
1375:         for (j=0; j<nz; j++) {
1376:           col    = pj[j];
1377:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1378:         }
1379:         /* U part */
1380:         pc1 = b->a + bdiag[i+1]+1;
1381:         pj  = b->j + bdiag[i+1]+1;
1382:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1383:         for (j=0; j<nz; j++) {
1384:           col    = pj[j];
1385:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1386:         }

1388:         sctx.rs = rs;
1389:         sctx.pv = rtmp1[i];
1390:         MatPivotCheck(B,A,info,&sctx,i);
1391:         if (sctx.newshift) break;
1392:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1393:         *pc1 = 1.0/sctx.pv;

1395:         /* Now take care of diagonal 2x2 block. */
1396:         pc2 = rtmp2 + i;
1397:         if (*pc2 != 0.0) {
1398:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1399:           *pc2 = mul1;          /* insert L entry */
1400:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1401:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1402:           for (j=0; j<nz; j++) {
1403:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1404:           }
1405:           PetscLogFlops(1+2.0*nz);
1406:         }

1408:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1409:         rs = 0.0;
1410:         /* L part */
1411:         pc2 = b->a + bi[i+1];
1412:         pj  = b->j + bi[i+1];
1413:         nz  = bi[i+2] - bi[i+1];
1414:         for (j=0; j<nz; j++) {
1415:           col    = pj[j];
1416:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1417:         }
1418:         /* U part */
1419:         pc2 = b->a + bdiag[i+2]+1;
1420:         pj  = b->j + bdiag[i+2]+1;
1421:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1422:         for (j=0; j<nz; j++) {
1423:           col    = pj[j];
1424:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1425:         }

1427:         sctx.rs = rs;
1428:         sctx.pv = rtmp2[i+1];
1429:         MatPivotCheck(B,A,info,&sctx,i+1);
1430:         if (sctx.newshift) break;
1431:         pc2  = b->a + bdiag[i+1];
1432:         *pc2 = 1.0/sctx.pv;
1433:         break;

1435:       case 3:
1436:         /*----------*/
1437:         /* zero rtmp */
1438:         /* L part */
1439:         nz    = bi[i+1] - bi[i];
1440:         bjtmp = bj + bi[i];
1441:         for  (j=0; j<nz; j++) {
1442:           col        = bjtmp[j];
1443:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1444:         }

1446:         /* U part */
1447:         nz    = bdiag[i]-bdiag[i+1];
1448:         bjtmp = bj + bdiag[i+1]+1;
1449:         for  (j=0; j<nz; j++) {
1450:           col        = bjtmp[j];
1451:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1452:         }

1454:         /* load in initial (unfactored row) */
1455:         nz    = ai[r[i]+1] - ai[r[i]];
1456:         ajtmp = aj + ai[r[i]];
1457:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1458:         for (j=0; j<nz; j++) {
1459:           col        = ics[ajtmp[j]];
1460:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1461:         }
1462:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1463:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;

1465:         /* elimination */
1466:         bjtmp = bj + bi[i];
1467:         row   = *bjtmp++; /* pivot row */
1468:         nzL   = bi[i+1] - bi[i];
1469:         for (k=0; k < nzL; k++) {
1470:           pc1 = rtmp1 + row;
1471:           pc2 = rtmp2 + row;
1472:           pc3 = rtmp3 + row;
1473:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1474:             pv   = b->a + bdiag[row];
1475:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1476:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1478:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1479:             pv = b->a + bdiag[row+1]+1;
1480:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1481:             for (j=0; j<nz; j++) {
1482:               col         = pj[j];
1483:               rtmp1[col] -= mul1 * pv[j];
1484:               rtmp2[col] -= mul2 * pv[j];
1485:               rtmp3[col] -= mul3 * pv[j];
1486:             }
1487:             PetscLogFlops(3+6.0*nz);
1488:           }
1489:           row = *bjtmp++;
1490:         }

1492:         /* finished row i; check zero pivot, then stick row i into b->a */
1493:         rs = 0.0;
1494:         /* L part */
1495:         pc1 = b->a + bi[i];
1496:         pj  = b->j + bi[i];
1497:         nz  = bi[i+1] - bi[i];
1498:         for (j=0; j<nz; j++) {
1499:           col    = pj[j];
1500:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1501:         }
1502:         /* U part */
1503:         pc1 = b->a + bdiag[i+1]+1;
1504:         pj  = b->j + bdiag[i+1]+1;
1505:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1506:         for (j=0; j<nz; j++) {
1507:           col    = pj[j];
1508:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1509:         }

1511:         sctx.rs = rs;
1512:         sctx.pv = rtmp1[i];
1513:         MatPivotCheck(B,A,info,&sctx,i);
1514:         if (sctx.newshift) break;
1515:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1516:         *pc1 = 1.0/sctx.pv;

1518:         /* Now take care of 1st column of diagonal 3x3 block. */
1519:         pc2 = rtmp2 + i;
1520:         pc3 = rtmp3 + i;
1521:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1522:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1523:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1524:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1525:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1526:           for (j=0; j<nz; j++) {
1527:             col         = pj[j];
1528:             rtmp2[col] -= mul2 * rtmp1[col];
1529:             rtmp3[col] -= mul3 * rtmp1[col];
1530:           }
1531:           PetscLogFlops(2+4.0*nz);
1532:         }

1534:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1535:         rs = 0.0;
1536:         /* L part */
1537:         pc2 = b->a + bi[i+1];
1538:         pj  = b->j + bi[i+1];
1539:         nz  = bi[i+2] - bi[i+1];
1540:         for (j=0; j<nz; j++) {
1541:           col    = pj[j];
1542:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1543:         }
1544:         /* U part */
1545:         pc2 = b->a + bdiag[i+2]+1;
1546:         pj  = b->j + bdiag[i+2]+1;
1547:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1548:         for (j=0; j<nz; j++) {
1549:           col    = pj[j];
1550:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1551:         }

1553:         sctx.rs = rs;
1554:         sctx.pv = rtmp2[i+1];
1555:         MatPivotCheck(B,A,info,&sctx,i+1);
1556:         if (sctx.newshift) break;
1557:         pc2  = b->a + bdiag[i+1];
1558:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1560:         /* Now take care of 2nd column of diagonal 3x3 block. */
1561:         pc3 = rtmp3 + i+1;
1562:         if (*pc3 != 0.0) {
1563:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1564:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1565:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1566:           for (j=0; j<nz; j++) {
1567:             col         = pj[j];
1568:             rtmp3[col] -= mul3 * rtmp2[col];
1569:           }
1570:           PetscLogFlops(1+2.0*nz);
1571:         }

1573:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1574:         rs = 0.0;
1575:         /* L part */
1576:         pc3 = b->a + bi[i+2];
1577:         pj  = b->j + bi[i+2];
1578:         nz  = bi[i+3] - bi[i+2];
1579:         for (j=0; j<nz; j++) {
1580:           col    = pj[j];
1581:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1582:         }
1583:         /* U part */
1584:         pc3 = b->a + bdiag[i+3]+1;
1585:         pj  = b->j + bdiag[i+3]+1;
1586:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1587:         for (j=0; j<nz; j++) {
1588:           col    = pj[j];
1589:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1590:         }

1592:         sctx.rs = rs;
1593:         sctx.pv = rtmp3[i+2];
1594:         MatPivotCheck(B,A,info,&sctx,i+2);
1595:         if (sctx.newshift) break;
1596:         pc3  = b->a + bdiag[i+2];
1597:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1598:         break;
1599:       case 4:
1600:         /*----------*/
1601:         /* zero rtmp */
1602:         /* L part */
1603:         nz    = bi[i+1] - bi[i];
1604:         bjtmp = bj + bi[i];
1605:         for  (j=0; j<nz; j++) {
1606:           col        = bjtmp[j];
1607:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1608:         }

1610:         /* U part */
1611:         nz    = bdiag[i]-bdiag[i+1];
1612:         bjtmp = bj + bdiag[i+1]+1;
1613:         for  (j=0; j<nz; j++) {
1614:           col        = bjtmp[j];
1615:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1616:         }

1618:         /* load in initial (unfactored row) */
1619:         nz    = ai[r[i]+1] - ai[r[i]];
1620:         ajtmp = aj + ai[r[i]];
1621:         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1622:         for (j=0; j<nz; j++) {
1623:           col        = ics[ajtmp[j]];
1624:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1625:         }
1626:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1627:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;

1629:         /* elimination */
1630:         bjtmp = bj + bi[i];
1631:         row   = *bjtmp++; /* pivot row */
1632:         nzL   = bi[i+1] - bi[i];
1633:         for (k=0; k < nzL; k++) {
1634:           pc1 = rtmp1 + row;
1635:           pc2 = rtmp2 + row;
1636:           pc3 = rtmp3 + row;
1637:           pc4 = rtmp4 + row;
1638:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1639:             pv   = b->a + bdiag[row];
1640:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1641:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;

1643:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1644:             pv = b->a + bdiag[row+1]+1;
1645:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1646:             for (j=0; j<nz; j++) {
1647:               col         = pj[j];
1648:               rtmp1[col] -= mul1 * pv[j];
1649:               rtmp2[col] -= mul2 * pv[j];
1650:               rtmp3[col] -= mul3 * pv[j];
1651:               rtmp4[col] -= mul4 * pv[j];
1652:             }
1653:             PetscLogFlops(4+8.0*nz);
1654:           }
1655:           row = *bjtmp++;
1656:         }

1658:         /* finished row i; check zero pivot, then stick row i into b->a */
1659:         rs = 0.0;
1660:         /* L part */
1661:         pc1 = b->a + bi[i];
1662:         pj  = b->j + bi[i];
1663:         nz  = bi[i+1] - bi[i];
1664:         for (j=0; j<nz; j++) {
1665:           col    = pj[j];
1666:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1667:         }
1668:         /* U part */
1669:         pc1 = b->a + bdiag[i+1]+1;
1670:         pj  = b->j + bdiag[i+1]+1;
1671:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1672:         for (j=0; j<nz; j++) {
1673:           col    = pj[j];
1674:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1675:         }

1677:         sctx.rs = rs;
1678:         sctx.pv = rtmp1[i];
1679:         MatPivotCheck(B,A,info,&sctx,i);
1680:         if (sctx.newshift) break;
1681:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1682:         *pc1 = 1.0/sctx.pv;

1684:         /* Now take care of 1st column of diagonal 4x4 block. */
1685:         pc2 = rtmp2 + i;
1686:         pc3 = rtmp3 + i;
1687:         pc4 = rtmp4 + i;
1688:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1689:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1690:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1691:           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1692:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1693:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1694:           for (j=0; j<nz; j++) {
1695:             col         = pj[j];
1696:             rtmp2[col] -= mul2 * rtmp1[col];
1697:             rtmp3[col] -= mul3 * rtmp1[col];
1698:             rtmp4[col] -= mul4 * rtmp1[col];
1699:           }
1700:           PetscLogFlops(3+6.0*nz);
1701:         }

1703:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1704:         rs = 0.0;
1705:         /* L part */
1706:         pc2 = b->a + bi[i+1];
1707:         pj  = b->j + bi[i+1];
1708:         nz  = bi[i+2] - bi[i+1];
1709:         for (j=0; j<nz; j++) {
1710:           col    = pj[j];
1711:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1712:         }
1713:         /* U part */
1714:         pc2 = b->a + bdiag[i+2]+1;
1715:         pj  = b->j + bdiag[i+2]+1;
1716:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1717:         for (j=0; j<nz; j++) {
1718:           col    = pj[j];
1719:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1720:         }

1722:         sctx.rs = rs;
1723:         sctx.pv = rtmp2[i+1];
1724:         MatPivotCheck(B,A,info,&sctx,i+1);
1725:         if (sctx.newshift) break;
1726:         pc2  = b->a + bdiag[i+1];
1727:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1729:         /* Now take care of 2nd column of diagonal 4x4 block. */
1730:         pc3 = rtmp3 + i+1;
1731:         pc4 = rtmp4 + i+1;
1732:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1733:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1734:           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1735:           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1736:           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1737:           for (j=0; j<nz; j++) {
1738:             col         = pj[j];
1739:             rtmp3[col] -= mul3 * rtmp2[col];
1740:             rtmp4[col] -= mul4 * rtmp2[col];
1741:           }
1742:           PetscLogFlops(4.0*nz);
1743:         }

1745:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1746:         rs = 0.0;
1747:         /* L part */
1748:         pc3 = b->a + bi[i+2];
1749:         pj  = b->j + bi[i+2];
1750:         nz  = bi[i+3] - bi[i+2];
1751:         for (j=0; j<nz; j++) {
1752:           col    = pj[j];
1753:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1754:         }
1755:         /* U part */
1756:         pc3 = b->a + bdiag[i+3]+1;
1757:         pj  = b->j + bdiag[i+3]+1;
1758:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1759:         for (j=0; j<nz; j++) {
1760:           col    = pj[j];
1761:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1762:         }

1764:         sctx.rs = rs;
1765:         sctx.pv = rtmp3[i+2];
1766:         MatPivotCheck(B,A,info,&sctx,i+2);
1767:         if (sctx.newshift) break;
1768:         pc3  = b->a + bdiag[i+2];
1769:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */

1771:         /* Now take care of 3rd column of diagonal 4x4 block. */
1772:         pc4 = rtmp4 + i+2;
1773:         if (*pc4 != 0.0) {
1774:           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1775:           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1776:           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1777:           for (j=0; j<nz; j++) {
1778:             col         = pj[j];
1779:             rtmp4[col] -= mul4 * rtmp3[col];
1780:           }
1781:           PetscLogFlops(1+2.0*nz);
1782:         }

1784:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1785:         rs = 0.0;
1786:         /* L part */
1787:         pc4 = b->a + bi[i+3];
1788:         pj  = b->j + bi[i+3];
1789:         nz  = bi[i+4] - bi[i+3];
1790:         for (j=0; j<nz; j++) {
1791:           col    = pj[j];
1792:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1793:         }
1794:         /* U part */
1795:         pc4 = b->a + bdiag[i+4]+1;
1796:         pj  = b->j + bdiag[i+4]+1;
1797:         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1798:         for (j=0; j<nz; j++) {
1799:           col    = pj[j];
1800:           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1801:         }

1803:         sctx.rs = rs;
1804:         sctx.pv = rtmp4[i+3];
1805:         MatPivotCheck(B,A,info,&sctx,i+3);
1806:         if (sctx.newshift) break;
1807:         pc4  = b->a + bdiag[i+3];
1808:         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1809:         break;

1811:       default:
1812:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1813:       }
1814:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1815:       i += nodesz;                 /* Update the row */
1816:     }

1818:     /* MatPivotRefine() */
1819:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1820:       /*
1821:        * if no shift in this attempt & shifting & started shifting & can refine,
1822:        * then try lower shift
1823:        */
1824:       sctx.shift_hi       = sctx.shift_fraction;
1825:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1826:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1827:       sctx.newshift       = PETSC_TRUE;
1828:       sctx.nshift++;
1829:     }
1830:   } while (sctx.newshift);

1832:   PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1833:   PetscFree(tmp_vec2);
1834:   ISRestoreIndices(isicol,&ic);
1835:   ISRestoreIndices(isrow,&r);

1837:   if (b->inode.size) {
1838:     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1839:   } else {
1840:     C->ops->solve           = MatSolve_SeqAIJ;
1841:   }
1842:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1843:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1844:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1845:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1846:   C->assembled              = PETSC_TRUE;
1847:   C->preallocated           = PETSC_TRUE;

1849:   PetscLogFlops(C->cmap->n);

1851:   /* MatShiftView(A,info,&sctx) */
1852:   if (sctx.nshift) {
1853:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1854:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1855:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1856:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1857:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1858:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1859:     }
1860:   }
1861:   return(0);
1862: }

1864: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1865: {
1866:   Mat             C     = B;
1867:   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1868:   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1869:   PetscErrorCode  ierr;
1870:   const PetscInt  *r,*ic,*c,*ics;
1871:   PetscInt        n   = A->rmap->n,*bi = b->i;
1872:   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1873:   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1874:   PetscInt        *ai = a->i,*aj = a->j;
1875:   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1876:   PetscScalar     mul1,mul2,mul3,tmp;
1877:   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1878:   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1879:   PetscReal       rs=0.0;
1880:   FactorShiftCtx  sctx;

1883:   sctx.shift_top      = 0;
1884:   sctx.nshift_max     = 0;
1885:   sctx.shift_lo       = 0;
1886:   sctx.shift_hi       = 0;
1887:   sctx.shift_fraction = 0;

1889:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1890:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1891:     sctx.shift_top = 0;
1892:     for (i=0; i<n; i++) {
1893:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1894:       rs    = 0.0;
1895:       ajtmp = aj + ai[i];
1896:       rtmp1 = aa + ai[i];
1897:       nz    = ai[i+1] - ai[i];
1898:       for (j=0; j<nz; j++) {
1899:         if (*ajtmp != i) {
1900:           rs += PetscAbsScalar(*rtmp1++);
1901:         } else {
1902:           rs -= PetscRealPart(*rtmp1++);
1903:         }
1904:         ajtmp++;
1905:       }
1906:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1907:     }
1908:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1909:     sctx.shift_top *= 1.1;
1910:     sctx.nshift_max = 5;
1911:     sctx.shift_lo   = 0.;
1912:     sctx.shift_hi   = 1.;
1913:   }
1914:   sctx.shift_amount = 0;
1915:   sctx.nshift       = 0;

1917:   ISGetIndices(isrow,&r);
1918:   ISGetIndices(iscol,&c);
1919:   ISGetIndices(isicol,&ic);
1920:   PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1921:   ics    = ic;

1923:   node_max = a->inode.node_count;
1924:   ns       = a->inode.size;
1925:   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");

1927:   /* If max inode size > 3, split it into two inodes.*/
1928:   /* also map the inode sizes according to the ordering */
1929:   PetscMalloc1(n+1,&tmp_vec1);
1930:   for (i=0,j=0; i<node_max; ++i,++j) {
1931:     if (ns[i]>3) {
1932:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1933:       ++j;
1934:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1935:     } else {
1936:       tmp_vec1[j] = ns[i];
1937:     }
1938:   }
1939:   /* Use the correct node_max */
1940:   node_max = j;

1942:   /* Now reorder the inode info based on mat re-ordering info */
1943:   /* First create a row -> inode_size_array_index map */
1944:   PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);
1945:   for (i=0,row=0; i<node_max; i++) {
1946:     nodesz = tmp_vec1[i];
1947:     for (j=0; j<nodesz; j++,row++) {
1948:       nsmap[row] = i;
1949:     }
1950:   }
1951:   /* Using nsmap, create a reordered ns structure */
1952:   for (i=0,j=0; i< node_max; i++) {
1953:     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1954:     tmp_vec2[i] = nodesz;
1955:     j          += nodesz;
1956:   }
1957:   PetscFree2(nsmap,tmp_vec1);
1958:   /* Now use the correct ns */
1959:   ns = tmp_vec2;

1961:   do {
1962:     sctx.newshift = PETSC_FALSE;
1963:     /* Now loop over each block-row, and do the factorization */
1964:     for (i=0,row=0; i<node_max; i++) {
1965:       nodesz = ns[i];
1966:       nz     = bi[row+1] - bi[row];
1967:       bjtmp  = bj + bi[row];

1969:       switch (nodesz) {
1970:       case 1:
1971:         for  (j=0; j<nz; j++) {
1972:           idx         = bjtmp[j];
1973:           rtmp11[idx] = 0.0;
1974:         }

1976:         /* load in initial (unfactored row) */
1977:         idx    = r[row];
1978:         nz_tmp = ai[idx+1] - ai[idx];
1979:         ajtmp  = aj + ai[idx];
1980:         v1     = aa + ai[idx];

1982:         for (j=0; j<nz_tmp; j++) {
1983:           idx         = ics[ajtmp[j]];
1984:           rtmp11[idx] = v1[j];
1985:         }
1986:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1988:         prow = *bjtmp++;
1989:         while (prow < row) {
1990:           pc1 = rtmp11 + prow;
1991:           if (*pc1 != 0.0) {
1992:             pv     = ba + bd[prow];
1993:             pj     = nbj + bd[prow];
1994:             mul1   = *pc1 * *pv++;
1995:             *pc1   = mul1;
1996:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1997:             PetscLogFlops(1+2.0*nz_tmp);
1998:             for (j=0; j<nz_tmp; j++) {
1999:               tmp          = pv[j];
2000:               idx          = pj[j];
2001:               rtmp11[idx] -= mul1 * tmp;
2002:             }
2003:           }
2004:           prow = *bjtmp++;
2005:         }
2006:         pj  = bj + bi[row];
2007:         pc1 = ba + bi[row];

2009:         sctx.pv     = rtmp11[row];
2010:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2011:         rs          = 0.0;
2012:         for (j=0; j<nz; j++) {
2013:           idx    = pj[j];
2014:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2015:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2016:         }
2017:         sctx.rs = rs;
2018:         MatPivotCheck(B,A,info,&sctx,row);
2019:         if (sctx.newshift) goto endofwhile;
2020:         break;

2022:       case 2:
2023:         for (j=0; j<nz; j++) {
2024:           idx         = bjtmp[j];
2025:           rtmp11[idx] = 0.0;
2026:           rtmp22[idx] = 0.0;
2027:         }

2029:         /* load in initial (unfactored row) */
2030:         idx    = r[row];
2031:         nz_tmp = ai[idx+1] - ai[idx];
2032:         ajtmp  = aj + ai[idx];
2033:         v1     = aa + ai[idx];
2034:         v2     = aa + ai[idx+1];
2035:         for (j=0; j<nz_tmp; j++) {
2036:           idx         = ics[ajtmp[j]];
2037:           rtmp11[idx] = v1[j];
2038:           rtmp22[idx] = v2[j];
2039:         }
2040:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2041:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

2043:         prow = *bjtmp++;
2044:         while (prow < row) {
2045:           pc1 = rtmp11 + prow;
2046:           pc2 = rtmp22 + prow;
2047:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2048:             pv   = ba + bd[prow];
2049:             pj   = nbj + bd[prow];
2050:             mul1 = *pc1 * *pv;
2051:             mul2 = *pc2 * *pv;
2052:             ++pv;
2053:             *pc1 = mul1;
2054:             *pc2 = mul2;

2056:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2057:             for (j=0; j<nz_tmp; j++) {
2058:               tmp          = pv[j];
2059:               idx          = pj[j];
2060:               rtmp11[idx] -= mul1 * tmp;
2061:               rtmp22[idx] -= mul2 * tmp;
2062:             }
2063:             PetscLogFlops(2+4.0*nz_tmp);
2064:           }
2065:           prow = *bjtmp++;
2066:         }

2068:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2069:         pc1 = rtmp11 + prow;
2070:         pc2 = rtmp22 + prow;

2072:         sctx.pv = *pc1;
2073:         pj      = bj + bi[prow];
2074:         rs      = 0.0;
2075:         for (j=0; j<nz; j++) {
2076:           idx = pj[j];
2077:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2078:         }
2079:         sctx.rs = rs;
2080:         MatPivotCheck(B,A,info,&sctx,row);
2081:         if (sctx.newshift) goto endofwhile;

2083:         if (*pc2 != 0.0) {
2084:           pj     = nbj + bd[prow];
2085:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2086:           *pc2   = mul2;
2087:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2088:           for (j=0; j<nz_tmp; j++) {
2089:             idx          = pj[j];
2090:             tmp          = rtmp11[idx];
2091:             rtmp22[idx] -= mul2 * tmp;
2092:           }
2093:           PetscLogFlops(1+2.0*nz_tmp);
2094:         }

2096:         pj  = bj + bi[row];
2097:         pc1 = ba + bi[row];
2098:         pc2 = ba + bi[row+1];

2100:         sctx.pv       = rtmp22[row+1];
2101:         rs            = 0.0;
2102:         rtmp11[row]   = 1.0/rtmp11[row];
2103:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2104:         /* copy row entries from dense representation to sparse */
2105:         for (j=0; j<nz; j++) {
2106:           idx    = pj[j];
2107:           pc1[j] = rtmp11[idx];
2108:           pc2[j] = rtmp22[idx];
2109:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2110:         }
2111:         sctx.rs = rs;
2112:         MatPivotCheck(B,A,info,&sctx,row+1);
2113:         if (sctx.newshift) goto endofwhile;
2114:         break;

2116:       case 3:
2117:         for  (j=0; j<nz; j++) {
2118:           idx         = bjtmp[j];
2119:           rtmp11[idx] = 0.0;
2120:           rtmp22[idx] = 0.0;
2121:           rtmp33[idx] = 0.0;
2122:         }
2123:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2124:         idx    = r[row];
2125:         nz_tmp = ai[idx+1] - ai[idx];
2126:         ajtmp  = aj + ai[idx];
2127:         v1     = aa + ai[idx];
2128:         v2     = aa + ai[idx+1];
2129:         v3     = aa + ai[idx+2];
2130:         for (j=0; j<nz_tmp; j++) {
2131:           idx         = ics[ajtmp[j]];
2132:           rtmp11[idx] = v1[j];
2133:           rtmp22[idx] = v2[j];
2134:           rtmp33[idx] = v3[j];
2135:         }
2136:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2137:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2138:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

2140:         /* loop over all pivot row blocks above this row block */
2141:         prow = *bjtmp++;
2142:         while (prow < row) {
2143:           pc1 = rtmp11 + prow;
2144:           pc2 = rtmp22 + prow;
2145:           pc3 = rtmp33 + prow;
2146:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2147:             pv   = ba  + bd[prow];
2148:             pj   = nbj + bd[prow];
2149:             mul1 = *pc1 * *pv;
2150:             mul2 = *pc2 * *pv;
2151:             mul3 = *pc3 * *pv;
2152:             ++pv;
2153:             *pc1 = mul1;
2154:             *pc2 = mul2;
2155:             *pc3 = mul3;

2157:             nz_tmp = bi[prow+1] - bd[prow] - 1;
2158:             /* update this row based on pivot row */
2159:             for (j=0; j<nz_tmp; j++) {
2160:               tmp          = pv[j];
2161:               idx          = pj[j];
2162:               rtmp11[idx] -= mul1 * tmp;
2163:               rtmp22[idx] -= mul2 * tmp;
2164:               rtmp33[idx] -= mul3 * tmp;
2165:             }
2166:             PetscLogFlops(3+6.0*nz_tmp);
2167:           }
2168:           prow = *bjtmp++;
2169:         }

2171:         /* Now take care of diagonal 3x3 block in this set of rows */
2172:         /* note: prow = row here */
2173:         pc1 = rtmp11 + prow;
2174:         pc2 = rtmp22 + prow;
2175:         pc3 = rtmp33 + prow;

2177:         sctx.pv = *pc1;
2178:         pj      = bj + bi[prow];
2179:         rs      = 0.0;
2180:         for (j=0; j<nz; j++) {
2181:           idx = pj[j];
2182:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2183:         }
2184:         sctx.rs = rs;
2185:         MatPivotCheck(B,A,info,&sctx,row);
2186:         if (sctx.newshift) goto endofwhile;

2188:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2189:           mul2   = (*pc2)/(*pc1);
2190:           mul3   = (*pc3)/(*pc1);
2191:           *pc2   = mul2;
2192:           *pc3   = mul3;
2193:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2194:           pj     = nbj + bd[prow];
2195:           for (j=0; j<nz_tmp; j++) {
2196:             idx          = pj[j];
2197:             tmp          = rtmp11[idx];
2198:             rtmp22[idx] -= mul2 * tmp;
2199:             rtmp33[idx] -= mul3 * tmp;
2200:           }
2201:           PetscLogFlops(2+4.0*nz_tmp);
2202:         }
2203:         ++prow;

2205:         pc2     = rtmp22 + prow;
2206:         pc3     = rtmp33 + prow;
2207:         sctx.pv = *pc2;
2208:         pj      = bj + bi[prow];
2209:         rs      = 0.0;
2210:         for (j=0; j<nz; j++) {
2211:           idx = pj[j];
2212:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2213:         }
2214:         sctx.rs = rs;
2215:         MatPivotCheck(B,A,info,&sctx,row+1);
2216:         if (sctx.newshift) goto endofwhile;

2218:         if (*pc3 != 0.0) {
2219:           mul3   = (*pc3)/(*pc2);
2220:           *pc3   = mul3;
2221:           pj     = nbj + bd[prow];
2222:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2223:           for (j=0; j<nz_tmp; j++) {
2224:             idx          = pj[j];
2225:             tmp          = rtmp22[idx];
2226:             rtmp33[idx] -= mul3 * tmp;
2227:           }
2228:           PetscLogFlops(1+2.0*nz_tmp);
2229:         }

2231:         pj  = bj + bi[row];
2232:         pc1 = ba + bi[row];
2233:         pc2 = ba + bi[row+1];
2234:         pc3 = ba + bi[row+2];

2236:         sctx.pv       = rtmp33[row+2];
2237:         rs            = 0.0;
2238:         rtmp11[row]   = 1.0/rtmp11[row];
2239:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2240:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2241:         /* copy row entries from dense representation to sparse */
2242:         for (j=0; j<nz; j++) {
2243:           idx    = pj[j];
2244:           pc1[j] = rtmp11[idx];
2245:           pc2[j] = rtmp22[idx];
2246:           pc3[j] = rtmp33[idx];
2247:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2248:         }

2250:         sctx.rs = rs;
2251:         MatPivotCheck(B,A,info,&sctx,row+2);
2252:         if (sctx.newshift) goto endofwhile;
2253:         break;

2255:       default:
2256:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2257:       }
2258:       row += nodesz;                 /* Update the row */
2259:     }
2260: endofwhile:;
2261:   } while (sctx.newshift);
2262:   PetscFree3(rtmp11,rtmp22,rtmp33);
2263:   PetscFree(tmp_vec2);
2264:   ISRestoreIndices(isicol,&ic);
2265:   ISRestoreIndices(isrow,&r);
2266:   ISRestoreIndices(iscol,&c);

2268:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2269:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2270:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2271:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2272:   C->assembled              = PETSC_TRUE;
2273:   C->preallocated           = PETSC_TRUE;
2274:   if (sctx.nshift) {
2275:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2276:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2277:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2278:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2279:     }
2280:   }
2281:   PetscLogFlops(C->cmap->n);
2282:   MatSeqAIJCheckInode(C);
2283:   return(0);
2284: }

2286: /* ----------------------------------------------------------- */
2287: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2288: {
2289:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2290:   IS                iscol = a->col,isrow = a->row;
2291:   PetscErrorCode    ierr;
2292:   const PetscInt    *r,*c,*rout,*cout;
2293:   PetscInt          i,j,n = A->rmap->n;
2294:   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2295:   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2296:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2297:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2298:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2299:   const PetscScalar *b;

2302:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2303:   node_max = a->inode.node_count;
2304:   ns       = a->inode.size;     /* Node Size array */

2306:   VecGetArrayRead(bb,&b);
2307:   VecGetArrayWrite(xx,&x);
2308:   tmp  = a->solve_work;

2310:   ISGetIndices(isrow,&rout); r = rout;
2311:   ISGetIndices(iscol,&cout); c = cout;

2313:   /* forward solve the lower triangular */
2314:   tmps = tmp;
2315:   aa   = a_a;
2316:   aj   = a_j;
2317:   ad   = a->diag;

2319:   for (i = 0,row = 0; i< node_max; ++i) {
2320:     nsz = ns[i];
2321:     aii = ai[row];
2322:     v1  = aa + aii;
2323:     vi  = aj + aii;
2324:     nz  = ai[row+1]- ai[row];

2326:     if (i < node_max-1) {
2327:       /* Prefetch the indices for the next block */
2328:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2329:       /* Prefetch the data for the next block */
2330:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2331:     }

2333:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2334:     case 1:
2335:       sum1 = b[r[row]];
2336:       for (j=0; j<nz-1; j+=2) {
2337:         i0    = vi[j];
2338:         i1    = vi[j+1];
2339:         tmp0  = tmps[i0];
2340:         tmp1  = tmps[i1];
2341:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2342:       }
2343:       if (j == nz-1) {
2344:         tmp0  = tmps[vi[j]];
2345:         sum1 -= v1[j]*tmp0;
2346:       }
2347:       tmp[row++]=sum1;
2348:       break;
2349:     case 2:
2350:       sum1 = b[r[row]];
2351:       sum2 = b[r[row+1]];
2352:       v2   = aa + ai[row+1];

2354:       for (j=0; j<nz-1; j+=2) {
2355:         i0    = vi[j];
2356:         i1    = vi[j+1];
2357:         tmp0  = tmps[i0];
2358:         tmp1  = tmps[i1];
2359:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2360:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2361:       }
2362:       if (j == nz-1) {
2363:         tmp0  = tmps[vi[j]];
2364:         sum1 -= v1[j] *tmp0;
2365:         sum2 -= v2[j] *tmp0;
2366:       }
2367:       sum2     -= v2[nz] * sum1;
2368:       tmp[row++]=sum1;
2369:       tmp[row++]=sum2;
2370:       break;
2371:     case 3:
2372:       sum1 = b[r[row]];
2373:       sum2 = b[r[row+1]];
2374:       sum3 = b[r[row+2]];
2375:       v2   = aa + ai[row+1];
2376:       v3   = aa + ai[row+2];

2378:       for (j=0; j<nz-1; j+=2) {
2379:         i0    = vi[j];
2380:         i1    = vi[j+1];
2381:         tmp0  = tmps[i0];
2382:         tmp1  = tmps[i1];
2383:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2384:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2385:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2386:       }
2387:       if (j == nz-1) {
2388:         tmp0  = tmps[vi[j]];
2389:         sum1 -= v1[j] *tmp0;
2390:         sum2 -= v2[j] *tmp0;
2391:         sum3 -= v3[j] *tmp0;
2392:       }
2393:       sum2     -= v2[nz] * sum1;
2394:       sum3     -= v3[nz] * sum1;
2395:       sum3     -= v3[nz+1] * sum2;
2396:       tmp[row++]=sum1;
2397:       tmp[row++]=sum2;
2398:       tmp[row++]=sum3;
2399:       break;

2401:     case 4:
2402:       sum1 = b[r[row]];
2403:       sum2 = b[r[row+1]];
2404:       sum3 = b[r[row+2]];
2405:       sum4 = b[r[row+3]];
2406:       v2   = aa + ai[row+1];
2407:       v3   = aa + ai[row+2];
2408:       v4   = aa + ai[row+3];

2410:       for (j=0; j<nz-1; j+=2) {
2411:         i0    = vi[j];
2412:         i1    = vi[j+1];
2413:         tmp0  = tmps[i0];
2414:         tmp1  = tmps[i1];
2415:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2416:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2417:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2418:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2419:       }
2420:       if (j == nz-1) {
2421:         tmp0  = tmps[vi[j]];
2422:         sum1 -= v1[j] *tmp0;
2423:         sum2 -= v2[j] *tmp0;
2424:         sum3 -= v3[j] *tmp0;
2425:         sum4 -= v4[j] *tmp0;
2426:       }
2427:       sum2 -= v2[nz] * sum1;
2428:       sum3 -= v3[nz] * sum1;
2429:       sum4 -= v4[nz] * sum1;
2430:       sum3 -= v3[nz+1] * sum2;
2431:       sum4 -= v4[nz+1] * sum2;
2432:       sum4 -= v4[nz+2] * sum3;

2434:       tmp[row++]=sum1;
2435:       tmp[row++]=sum2;
2436:       tmp[row++]=sum3;
2437:       tmp[row++]=sum4;
2438:       break;
2439:     case 5:
2440:       sum1 = b[r[row]];
2441:       sum2 = b[r[row+1]];
2442:       sum3 = b[r[row+2]];
2443:       sum4 = b[r[row+3]];
2444:       sum5 = b[r[row+4]];
2445:       v2   = aa + ai[row+1];
2446:       v3   = aa + ai[row+2];
2447:       v4   = aa + ai[row+3];
2448:       v5   = aa + ai[row+4];

2450:       for (j=0; j<nz-1; j+=2) {
2451:         i0    = vi[j];
2452:         i1    = vi[j+1];
2453:         tmp0  = tmps[i0];
2454:         tmp1  = tmps[i1];
2455:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2456:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2457:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2458:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2459:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2460:       }
2461:       if (j == nz-1) {
2462:         tmp0  = tmps[vi[j]];
2463:         sum1 -= v1[j] *tmp0;
2464:         sum2 -= v2[j] *tmp0;
2465:         sum3 -= v3[j] *tmp0;
2466:         sum4 -= v4[j] *tmp0;
2467:         sum5 -= v5[j] *tmp0;
2468:       }

2470:       sum2 -= v2[nz] * sum1;
2471:       sum3 -= v3[nz] * sum1;
2472:       sum4 -= v4[nz] * sum1;
2473:       sum5 -= v5[nz] * sum1;
2474:       sum3 -= v3[nz+1] * sum2;
2475:       sum4 -= v4[nz+1] * sum2;
2476:       sum5 -= v5[nz+1] * sum2;
2477:       sum4 -= v4[nz+2] * sum3;
2478:       sum5 -= v5[nz+2] * sum3;
2479:       sum5 -= v5[nz+3] * sum4;

2481:       tmp[row++]=sum1;
2482:       tmp[row++]=sum2;
2483:       tmp[row++]=sum3;
2484:       tmp[row++]=sum4;
2485:       tmp[row++]=sum5;
2486:       break;
2487:     default:
2488:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2489:     }
2490:   }
2491:   /* backward solve the upper triangular */
2492:   for (i=node_max -1,row = n-1; i>=0; i--) {
2493:     nsz = ns[i];
2494:     aii = ad[row+1] + 1;
2495:     v1  = aa + aii;
2496:     vi  = aj + aii;
2497:     nz  = ad[row]- ad[row+1] - 1;

2499:     if (i > 0) {
2500:       /* Prefetch the indices for the next block */
2501:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2502:       /* Prefetch the data for the next block */
2503:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504:     }

2506:     switch (nsz) {               /* Each loop in 'case' is unrolled */
2507:     case 1:
2508:       sum1 = tmp[row];

2510:       for (j=0; j<nz-1; j+=2) {
2511:         i0    = vi[j];
2512:         i1    = vi[j+1];
2513:         tmp0  = tmps[i0];
2514:         tmp1  = tmps[i1];
2515:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2516:       }
2517:       if (j == nz-1) {
2518:         tmp0  = tmps[vi[j]];
2519:         sum1 -= v1[j]*tmp0;
2520:       }
2521:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2522:       break;
2523:     case 2:
2524:       sum1 = tmp[row];
2525:       sum2 = tmp[row-1];
2526:       v2   = aa + ad[row] + 1;
2527:       for (j=0; j<nz-1; j+=2) {
2528:         i0    = vi[j];
2529:         i1    = vi[j+1];
2530:         tmp0  = tmps[i0];
2531:         tmp1  = tmps[i1];
2532:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2533:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2534:       }
2535:       if (j == nz-1) {
2536:         tmp0  = tmps[vi[j]];
2537:         sum1 -= v1[j]* tmp0;
2538:         sum2 -= v2[j+1]* tmp0;
2539:       }

2541:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2542:       sum2     -= v2[0] * tmp0;
2543:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2544:       break;
2545:     case 3:
2546:       sum1 = tmp[row];
2547:       sum2 = tmp[row -1];
2548:       sum3 = tmp[row -2];
2549:       v2   = aa + ad[row] + 1;
2550:       v3   = aa + ad[row -1] + 1;
2551:       for (j=0; j<nz-1; j+=2) {
2552:         i0    = vi[j];
2553:         i1    = vi[j+1];
2554:         tmp0  = tmps[i0];
2555:         tmp1  = tmps[i1];
2556:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2558:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2559:       }
2560:       if (j== nz-1) {
2561:         tmp0  = tmps[vi[j]];
2562:         sum1 -= v1[j] * tmp0;
2563:         sum2 -= v2[j+1] * tmp0;
2564:         sum3 -= v3[j+2] * tmp0;
2565:       }
2566:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2567:       sum2     -= v2[0]* tmp0;
2568:       sum3     -= v3[1] * tmp0;
2569:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2570:       sum3     -= v3[0]* tmp0;
2571:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;

2573:       break;
2574:     case 4:
2575:       sum1 = tmp[row];
2576:       sum2 = tmp[row -1];
2577:       sum3 = tmp[row -2];
2578:       sum4 = tmp[row -3];
2579:       v2   = aa + ad[row]+1;
2580:       v3   = aa + ad[row -1]+1;
2581:       v4   = aa + ad[row -2]+1;

2583:       for (j=0; j<nz-1; j+=2) {
2584:         i0    = vi[j];
2585:         i1    = vi[j+1];
2586:         tmp0  = tmps[i0];
2587:         tmp1  = tmps[i1];
2588:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2589:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2590:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2591:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2592:       }
2593:       if (j== nz-1) {
2594:         tmp0  = tmps[vi[j]];
2595:         sum1 -= v1[j] * tmp0;
2596:         sum2 -= v2[j+1] * tmp0;
2597:         sum3 -= v3[j+2] * tmp0;
2598:         sum4 -= v4[j+3] * tmp0;
2599:       }

2601:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2602:       sum2     -= v2[0] * tmp0;
2603:       sum3     -= v3[1] * tmp0;
2604:       sum4     -= v4[2] * tmp0;
2605:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2606:       sum3     -= v3[0] * tmp0;
2607:       sum4     -= v4[1] * tmp0;
2608:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2609:       sum4     -= v4[0] * tmp0;
2610:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2611:       break;
2612:     case 5:
2613:       sum1 = tmp[row];
2614:       sum2 = tmp[row -1];
2615:       sum3 = tmp[row -2];
2616:       sum4 = tmp[row -3];
2617:       sum5 = tmp[row -4];
2618:       v2   = aa + ad[row]+1;
2619:       v3   = aa + ad[row -1]+1;
2620:       v4   = aa + ad[row -2]+1;
2621:       v5   = aa + ad[row -3]+1;
2622:       for (j=0; j<nz-1; j+=2) {
2623:         i0    = vi[j];
2624:         i1    = vi[j+1];
2625:         tmp0  = tmps[i0];
2626:         tmp1  = tmps[i1];
2627:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2628:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2629:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2630:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2631:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2632:       }
2633:       if (j==nz-1) {
2634:         tmp0  = tmps[vi[j]];
2635:         sum1 -= v1[j] * tmp0;
2636:         sum2 -= v2[j+1] * tmp0;
2637:         sum3 -= v3[j+2] * tmp0;
2638:         sum4 -= v4[j+3] * tmp0;
2639:         sum5 -= v5[j+4] * tmp0;
2640:       }

2642:       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643:       sum2     -= v2[0] * tmp0;
2644:       sum3     -= v3[1] * tmp0;
2645:       sum4     -= v4[2] * tmp0;
2646:       sum5     -= v5[3] * tmp0;
2647:       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2648:       sum3     -= v3[0] * tmp0;
2649:       sum4     -= v4[1] * tmp0;
2650:       sum5     -= v5[2] * tmp0;
2651:       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652:       sum4     -= v4[0] * tmp0;
2653:       sum5     -= v5[1] * tmp0;
2654:       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655:       sum5     -= v5[0] * tmp0;
2656:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2657:       break;
2658:     default:
2659:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2660:     }
2661:   }
2662:   ISRestoreIndices(isrow,&rout);
2663:   ISRestoreIndices(iscol,&cout);
2664:   VecRestoreArrayRead(bb,&b);
2665:   VecRestoreArrayWrite(xx,&x);
2666:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2667:   return(0);
2668: }

2670: /*
2671:      Makes a longer coloring[] array and calls the usual code with that
2672: */
2673: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2674: {
2675:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2676:   PetscErrorCode  ierr;
2677:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2678:   PetscInt        *colorused,i;
2679:   ISColoringValue *newcolor;

2682:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2683:   PetscMalloc1(n+1,&newcolor);
2684:   /* loop over inodes, marking a color for each column*/
2685:   row = 0;
2686:   for (i=0; i<m; i++) {
2687:     for (j=0; j<ns[i]; j++) {
2688:       newcolor[row++] = coloring[i] + j*ncolors;
2689:     }
2690:   }

2692:   /* eliminate unneeded colors */
2693:   PetscCalloc1(5*ncolors,&colorused);
2694:   for (i=0; i<n; i++) {
2695:     colorused[newcolor[i]] = 1;
2696:   }

2698:   for (i=1; i<5*ncolors; i++) {
2699:     colorused[i] += colorused[i-1];
2700:   }
2701:   ncolors = colorused[5*ncolors-1];
2702:   for (i=0; i<n; i++) {
2703:     newcolor[i] = colorused[newcolor[i]]-1;
2704:   }
2705:   PetscFree(colorused);
2706:   ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2707:   PetscFree(coloring);
2708:   return(0);
2709: }

2711: #include <petsc/private/kernels/blockinvert.h>

2713: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2714: {
2715:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2716:   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2717:   MatScalar         *ibdiag,*bdiag,work[25],*t;
2718:   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2719:   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2720:   const PetscScalar *xb, *b;
2721:   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2722:   PetscErrorCode    ierr;
2723:   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2724:   PetscInt          sz,k,ipvt[5];
2725:   PetscBool         allowzeropivot,zeropivotdetected;
2726:   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;

2729:   allowzeropivot = PetscNot(A->erroriffailure);
2730:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2731:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2732:   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");

2734:   if (!a->inode.ibdiagvalid) {
2735:     if (!a->inode.ibdiag) {
2736:       /* calculate space needed for diagonal blocks */
2737:       for (i=0; i<m; i++) {
2738:         cnt += sizes[i]*sizes[i];
2739:       }
2740:       a->inode.bdiagsize = cnt;

2742:       PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2743:     }

2745:     /* copy over the diagonal blocks and invert them */
2746:     ibdiag = a->inode.ibdiag;
2747:     bdiag  = a->inode.bdiag;
2748:     cnt    = 0;
2749:     for (i=0, row = 0; i<m; i++) {
2750:       for (j=0; j<sizes[i]; j++) {
2751:         for (k=0; k<sizes[i]; k++) {
2752:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2753:         }
2754:       }
2755:       PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);

2757:       switch (sizes[i]) {
2758:       case 1:
2759:         /* Create matrix data structure */
2760:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2761:           if (allowzeropivot) {
2762:             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2763:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2764:             A->factorerror_zeropivot_row   = row;
2765:             PetscInfo1(A,"Zero pivot, row %D\n",row);
2766:           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2767:         }
2768:         ibdiag[cnt] = 1.0/ibdiag[cnt];
2769:         break;
2770:       case 2:
2771:         PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2772:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2773:         break;
2774:       case 3:
2775:         PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2776:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2777:         break;
2778:       case 4:
2779:         PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2780:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2781:         break;
2782:       case 5:
2783:         PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2784:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2785:         break;
2786:       default:
2787:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2788:       }
2789:       cnt += sizes[i]*sizes[i];
2790:       row += sizes[i];
2791:     }
2792:     a->inode.ibdiagvalid = PETSC_TRUE;
2793:   }
2794:   ibdiag = a->inode.ibdiag;
2795:   bdiag  = a->inode.bdiag;
2796:   t      = a->inode.ssor_work;

2798:   VecGetArray(xx,&x);
2799:   VecGetArrayRead(bb,&b);
2800:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2801:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2802:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {

2804:       for (i=0, row=0; i<m; i++) {
2805:         sz  = diag[row] - ii[row];
2806:         v1  = a->a + ii[row];
2807:         idx = a->j + ii[row];

2809:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2810:         switch (sizes[i]) {
2811:         case 1:

2813:           sum1 = b[row];
2814:           for (n = 0; n<sz-1; n+=2) {
2815:             i1    = idx[0];
2816:             i2    = idx[1];
2817:             idx  += 2;
2818:             tmp0  = x[i1];
2819:             tmp1  = x[i2];
2820:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2821:           }

2823:           if (n == sz-1) {
2824:             tmp0  = x[*idx];
2825:             sum1 -= *v1 * tmp0;
2826:           }
2827:           t[row]   = sum1;
2828:           x[row++] = sum1*(*ibdiag++);
2829:           break;
2830:         case 2:
2831:           v2   = a->a + ii[row+1];
2832:           sum1 = b[row];
2833:           sum2 = b[row+1];
2834:           for (n = 0; n<sz-1; n+=2) {
2835:             i1    = idx[0];
2836:             i2    = idx[1];
2837:             idx  += 2;
2838:             tmp0  = x[i1];
2839:             tmp1  = x[i2];
2840:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2841:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2842:           }

2844:           if (n == sz-1) {
2845:             tmp0  = x[*idx];
2846:             sum1 -= v1[0] * tmp0;
2847:             sum2 -= v2[0] * tmp0;
2848:           }
2849:           t[row]   = sum1;
2850:           t[row+1] = sum2;
2851:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2852:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2853:           ibdiag  += 4;
2854:           break;
2855:         case 3:
2856:           v2   = a->a + ii[row+1];
2857:           v3   = a->a + ii[row+2];
2858:           sum1 = b[row];
2859:           sum2 = b[row+1];
2860:           sum3 = b[row+2];
2861:           for (n = 0; n<sz-1; n+=2) {
2862:             i1    = idx[0];
2863:             i2    = idx[1];
2864:             idx  += 2;
2865:             tmp0  = x[i1];
2866:             tmp1  = x[i2];
2867:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2868:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2869:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2870:           }

2872:           if (n == sz-1) {
2873:             tmp0  = x[*idx];
2874:             sum1 -= v1[0] * tmp0;
2875:             sum2 -= v2[0] * tmp0;
2876:             sum3 -= v3[0] * tmp0;
2877:           }
2878:           t[row]   = sum1;
2879:           t[row+1] = sum2;
2880:           t[row+2] = sum3;
2881:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2882:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2883:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2884:           ibdiag  += 9;
2885:           break;
2886:         case 4:
2887:           v2   = a->a + ii[row+1];
2888:           v3   = a->a + ii[row+2];
2889:           v4   = a->a + ii[row+3];
2890:           sum1 = b[row];
2891:           sum2 = b[row+1];
2892:           sum3 = b[row+2];
2893:           sum4 = b[row+3];
2894:           for (n = 0; n<sz-1; n+=2) {
2895:             i1    = idx[0];
2896:             i2    = idx[1];
2897:             idx  += 2;
2898:             tmp0  = x[i1];
2899:             tmp1  = x[i2];
2900:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2904:           }

2906:           if (n == sz-1) {
2907:             tmp0  = x[*idx];
2908:             sum1 -= v1[0] * tmp0;
2909:             sum2 -= v2[0] * tmp0;
2910:             sum3 -= v3[0] * tmp0;
2911:             sum4 -= v4[0] * tmp0;
2912:           }
2913:           t[row]   = sum1;
2914:           t[row+1] = sum2;
2915:           t[row+2] = sum3;
2916:           t[row+3] = sum4;
2917:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2918:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2919:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2920:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2921:           ibdiag  += 16;
2922:           break;
2923:         case 5:
2924:           v2   = a->a + ii[row+1];
2925:           v3   = a->a + ii[row+2];
2926:           v4   = a->a + ii[row+3];
2927:           v5   = a->a + ii[row+4];
2928:           sum1 = b[row];
2929:           sum2 = b[row+1];
2930:           sum3 = b[row+2];
2931:           sum4 = b[row+3];
2932:           sum5 = b[row+4];
2933:           for (n = 0; n<sz-1; n+=2) {
2934:             i1    = idx[0];
2935:             i2    = idx[1];
2936:             idx  += 2;
2937:             tmp0  = x[i1];
2938:             tmp1  = x[i2];
2939:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2944:           }

2946:           if (n == sz-1) {
2947:             tmp0  = x[*idx];
2948:             sum1 -= v1[0] * tmp0;
2949:             sum2 -= v2[0] * tmp0;
2950:             sum3 -= v3[0] * tmp0;
2951:             sum4 -= v4[0] * tmp0;
2952:             sum5 -= v5[0] * tmp0;
2953:           }
2954:           t[row]   = sum1;
2955:           t[row+1] = sum2;
2956:           t[row+2] = sum3;
2957:           t[row+3] = sum4;
2958:           t[row+4] = sum5;
2959:           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2960:           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2961:           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2962:           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2963:           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2964:           ibdiag  += 25;
2965:           break;
2966:         default:
2967:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2968:         }
2969:       }

2971:       xb   = t;
2972:       PetscLogFlops(a->nz);
2973:     } else xb = b;
2974:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

2976:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2977:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2978:         ibdiag -= sizes[i]*sizes[i];
2979:         sz      = ii[row+1] - diag[row] - 1;
2980:         v1      = a->a + diag[row] + 1;
2981:         idx     = a->j + diag[row] + 1;

2983:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2984:         switch (sizes[i]) {
2985:         case 1:

2987:           sum1 = xb[row];
2988:           for (n = 0; n<sz-1; n+=2) {
2989:             i1    = idx[0];
2990:             i2    = idx[1];
2991:             idx  += 2;
2992:             tmp0  = x[i1];
2993:             tmp1  = x[i2];
2994:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2995:           }

2997:           if (n == sz-1) {
2998:             tmp0  = x[*idx];
2999:             sum1 -= *v1*tmp0;
3000:           }
3001:           x[row--] = sum1*(*ibdiag);
3002:           break;

3004:         case 2:

3006:           sum1 = xb[row];
3007:           sum2 = xb[row-1];
3008:           /* note that sum1 is associated with the second of the two rows */
3009:           v2 = a->a + diag[row-1] + 2;
3010:           for (n = 0; n<sz-1; n+=2) {
3011:             i1    = idx[0];
3012:             i2    = idx[1];
3013:             idx  += 2;
3014:             tmp0  = x[i1];
3015:             tmp1  = x[i2];
3016:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3017:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3018:           }

3020:           if (n == sz-1) {
3021:             tmp0  = x[*idx];
3022:             sum1 -= *v1*tmp0;
3023:             sum2 -= *v2*tmp0;
3024:           }
3025:           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3026:           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3027:           break;
3028:         case 3:

3030:           sum1 = xb[row];
3031:           sum2 = xb[row-1];
3032:           sum3 = xb[row-2];
3033:           v2   = a->a + diag[row-1] + 2;
3034:           v3   = a->a + diag[row-2] + 3;
3035:           for (n = 0; n<sz-1; n+=2) {
3036:             i1    = idx[0];
3037:             i2    = idx[1];
3038:             idx  += 2;
3039:             tmp0  = x[i1];
3040:             tmp1  = x[i2];
3041:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3042:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3043:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3044:           }

3046:           if (n == sz-1) {
3047:             tmp0  = x[*idx];
3048:             sum1 -= *v1*tmp0;
3049:             sum2 -= *v2*tmp0;
3050:             sum3 -= *v3*tmp0;
3051:           }
3052:           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3053:           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3054:           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3055:           break;
3056:         case 4:

3058:           sum1 = xb[row];
3059:           sum2 = xb[row-1];
3060:           sum3 = xb[row-2];
3061:           sum4 = xb[row-3];
3062:           v2   = a->a + diag[row-1] + 2;
3063:           v3   = a->a + diag[row-2] + 3;
3064:           v4   = a->a + diag[row-3] + 4;
3065:           for (n = 0; n<sz-1; n+=2) {
3066:             i1    = idx[0];
3067:             i2    = idx[1];
3068:             idx  += 2;
3069:             tmp0  = x[i1];
3070:             tmp1  = x[i2];
3071:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3072:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3073:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3074:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3075:           }

3077:           if (n == sz-1) {
3078:             tmp0  = x[*idx];
3079:             sum1 -= *v1*tmp0;
3080:             sum2 -= *v2*tmp0;
3081:             sum3 -= *v3*tmp0;
3082:             sum4 -= *v4*tmp0;
3083:           }
3084:           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3085:           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3086:           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3087:           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3088:           break;
3089:         case 5:

3091:           sum1 = xb[row];
3092:           sum2 = xb[row-1];
3093:           sum3 = xb[row-2];
3094:           sum4 = xb[row-3];
3095:           sum5 = xb[row-4];
3096:           v2   = a->a + diag[row-1] + 2;
3097:           v3   = a->a + diag[row-2] + 3;
3098:           v4   = a->a + diag[row-3] + 4;
3099:           v5   = a->a + diag[row-4] + 5;
3100:           for (n = 0; n<sz-1; n+=2) {
3101:             i1    = idx[0];
3102:             i2    = idx[1];
3103:             idx  += 2;
3104:             tmp0  = x[i1];
3105:             tmp1  = x[i2];
3106:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3107:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3108:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3109:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3110:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3111:           }

3113:           if (n == sz-1) {
3114:             tmp0  = x[*idx];
3115:             sum1 -= *v1*tmp0;
3116:             sum2 -= *v2*tmp0;
3117:             sum3 -= *v3*tmp0;
3118:             sum4 -= *v4*tmp0;
3119:             sum5 -= *v5*tmp0;
3120:           }
3121:           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3122:           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3123:           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3124:           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3125:           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3126:           break;
3127:         default:
3128:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3129:         }
3130:       }

3132:       PetscLogFlops(a->nz);
3133:     }
3134:     its--;
3135:   }
3136:   while (its--) {

3138:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3139:       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3140:            i<m;
3141:            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {

3143:         sz  = diag[row] - ii[row];
3144:         v1  = a->a + ii[row];
3145:         idx = a->j + ii[row];
3146:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3147:         switch (sizes[i]) {
3148:         case 1:
3149:           sum1 = b[row];
3150:           for (n = 0; n<sz-1; n+=2) {
3151:             i1    = idx[0];
3152:             i2    = idx[1];
3153:             idx  += 2;
3154:             tmp0  = x[i1];
3155:             tmp1  = x[i2];
3156:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3157:           }
3158:           if (n == sz-1) {
3159:             tmp0  = x[*idx++];
3160:             sum1 -= *v1 * tmp0;
3161:             v1++;
3162:           }
3163:           t[row]   = sum1;
3164:           sz      = ii[row+1] - diag[row] - 1;
3165:           idx     = a->j + diag[row] + 1;
3166:           v1 += 1;
3167:           for (n = 0; n<sz-1; n+=2) {
3168:             i1    = idx[0];
3169:             i2    = idx[1];
3170:             idx  += 2;
3171:             tmp0  = x[i1];
3172:             tmp1  = x[i2];
3173:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3174:           }
3175:           if (n == sz-1) {
3176:             tmp0  = x[*idx++];
3177:             sum1 -= *v1 * tmp0;
3178:           }
3179:           /* in MatSOR_SeqAIJ this line would be
3180:            *
3181:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3182:            *
3183:            * but omega == 1, so this becomes
3184:            *
3185:            * x[row] = sum1*(*ibdiag++);
3186:            *
3187:            */
3188:           x[row] = sum1*(*ibdiag);
3189:           break;
3190:         case 2:
3191:           v2   = a->a + ii[row+1];
3192:           sum1 = b[row];
3193:           sum2 = b[row+1];
3194:           for (n = 0; n<sz-1; n+=2) {
3195:             i1    = idx[0];
3196:             i2    = idx[1];
3197:             idx  += 2;
3198:             tmp0  = x[i1];
3199:             tmp1  = x[i2];
3200:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3201:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3202:           }
3203:           if (n == sz-1) {
3204:             tmp0  = x[*idx++];
3205:             sum1 -= v1[0] * tmp0;
3206:             sum2 -= v2[0] * tmp0;
3207:             v1++; v2++;
3208:           }
3209:           t[row]   = sum1;
3210:           t[row+1] = sum2;
3211:           sz      = ii[row+1] - diag[row] - 2;
3212:           idx     = a->j + diag[row] + 2;
3213:           v1 += 2;
3214:           v2 += 2;
3215:           for (n = 0; n<sz-1; n+=2) {
3216:             i1    = idx[0];
3217:             i2    = idx[1];
3218:             idx  += 2;
3219:             tmp0  = x[i1];
3220:             tmp1  = x[i2];
3221:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3222:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3223:           }
3224:           if (n == sz-1) {
3225:             tmp0  = x[*idx];
3226:             sum1 -= v1[0] * tmp0;
3227:             sum2 -= v2[0] * tmp0;
3228:           }
3229:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3230:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3231:           break;
3232:         case 3:
3233:           v2   = a->a + ii[row+1];
3234:           v3   = a->a + ii[row+2];
3235:           sum1 = b[row];
3236:           sum2 = b[row+1];
3237:           sum3 = b[row+2];
3238:           for (n = 0; n<sz-1; n+=2) {
3239:             i1    = idx[0];
3240:             i2    = idx[1];
3241:             idx  += 2;
3242:             tmp0  = x[i1];
3243:             tmp1  = x[i2];
3244:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3245:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3246:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3247:           }
3248:           if (n == sz-1) {
3249:             tmp0  = x[*idx++];
3250:             sum1 -= v1[0] * tmp0;
3251:             sum2 -= v2[0] * tmp0;
3252:             sum3 -= v3[0] * tmp0;
3253:             v1++; v2++; v3++;
3254:           }
3255:           t[row]   = sum1;
3256:           t[row+1] = sum2;
3257:           t[row+2] = sum3;
3258:           sz      = ii[row+1] - diag[row] - 3;
3259:           idx     = a->j + diag[row] + 3;
3260:           v1 += 3;
3261:           v2 += 3;
3262:           v3 += 3;
3263:           for (n = 0; n<sz-1; n+=2) {
3264:             i1    = idx[0];
3265:             i2    = idx[1];
3266:             idx  += 2;
3267:             tmp0  = x[i1];
3268:             tmp1  = x[i2];
3269:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3270:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3271:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3272:           }
3273:           if (n == sz-1) {
3274:             tmp0  = x[*idx];
3275:             sum1 -= v1[0] * tmp0;
3276:             sum2 -= v2[0] * tmp0;
3277:             sum3 -= v3[0] * tmp0;
3278:           }
3279:           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3280:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3281:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3282:           break;
3283:         case 4:
3284:           v2   = a->a + ii[row+1];
3285:           v3   = a->a + ii[row+2];
3286:           v4   = a->a + ii[row+3];
3287:           sum1 = b[row];
3288:           sum2 = b[row+1];
3289:           sum3 = b[row+2];
3290:           sum4 = b[row+3];
3291:           for (n = 0; n<sz-1; n+=2) {
3292:             i1    = idx[0];
3293:             i2    = idx[1];
3294:             idx  += 2;
3295:             tmp0  = x[i1];
3296:             tmp1  = x[i2];
3297:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3298:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3299:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3300:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3301:           }
3302:           if (n == sz-1) {
3303:             tmp0  = x[*idx++];
3304:             sum1 -= v1[0] * tmp0;
3305:             sum2 -= v2[0] * tmp0;
3306:             sum3 -= v3[0] * tmp0;
3307:             sum4 -= v4[0] * tmp0;
3308:             v1++; v2++; v3++; v4++;
3309:           }
3310:           t[row]   = sum1;
3311:           t[row+1] = sum2;
3312:           t[row+2] = sum3;
3313:           t[row+3] = sum4;
3314:           sz      = ii[row+1] - diag[row] - 4;
3315:           idx     = a->j + diag[row] + 4;
3316:           v1 += 4;
3317:           v2 += 4;
3318:           v3 += 4;
3319:           v4 += 4;
3320:           for (n = 0; n<sz-1; n+=2) {
3321:             i1    = idx[0];
3322:             i2    = idx[1];
3323:             idx  += 2;
3324:             tmp0  = x[i1];
3325:             tmp1  = x[i2];
3326:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3327:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3328:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3329:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3330:           }
3331:           if (n == sz-1) {
3332:             tmp0  = x[*idx];
3333:             sum1 -= v1[0] * tmp0;
3334:             sum2 -= v2[0] * tmp0;
3335:             sum3 -= v3[0] * tmp0;
3336:             sum4 -= v4[0] * tmp0;
3337:           }
3338:           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3339:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3340:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3341:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3342:           break;
3343:         case 5:
3344:           v2   = a->a + ii[row+1];
3345:           v3   = a->a + ii[row+2];
3346:           v4   = a->a + ii[row+3];
3347:           v5   = a->a + ii[row+4];
3348:           sum1 = b[row];
3349:           sum2 = b[row+1];
3350:           sum3 = b[row+2];
3351:           sum4 = b[row+3];
3352:           sum5 = b[row+4];
3353:           for (n = 0; n<sz-1; n+=2) {
3354:             i1    = idx[0];
3355:             i2    = idx[1];
3356:             idx  += 2;
3357:             tmp0  = x[i1];
3358:             tmp1  = x[i2];
3359:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3360:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3361:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3362:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3363:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3364:           }
3365:           if (n == sz-1) {
3366:             tmp0  = x[*idx++];
3367:             sum1 -= v1[0] * tmp0;
3368:             sum2 -= v2[0] * tmp0;
3369:             sum3 -= v3[0] * tmp0;
3370:             sum4 -= v4[0] * tmp0;
3371:             sum5 -= v5[0] * tmp0;
3372:             v1++; v2++; v3++; v4++; v5++;
3373:           }
3374:           t[row]   = sum1;
3375:           t[row+1] = sum2;
3376:           t[row+2] = sum3;
3377:           t[row+3] = sum4;
3378:           t[row+4] = sum5;
3379:           sz      = ii[row+1] - diag[row] - 5;
3380:           idx     = a->j + diag[row] + 5;
3381:           v1 += 5;
3382:           v2 += 5;
3383:           v3 += 5;
3384:           v4 += 5;
3385:           v5 += 5;
3386:           for (n = 0; n<sz-1; n+=2) {
3387:             i1    = idx[0];
3388:             i2    = idx[1];
3389:             idx  += 2;
3390:             tmp0  = x[i1];
3391:             tmp1  = x[i2];
3392:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3393:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3394:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3395:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3396:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3397:           }
3398:           if (n == sz-1) {
3399:             tmp0  = x[*idx];
3400:             sum1 -= v1[0] * tmp0;
3401:             sum2 -= v2[0] * tmp0;
3402:             sum3 -= v3[0] * tmp0;
3403:             sum4 -= v4[0] * tmp0;
3404:             sum5 -= v5[0] * tmp0;
3405:           }
3406:           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3407:           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3408:           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3409:           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3410:           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3411:           break;
3412:         default:
3413:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3414:         }
3415:       }
3416:       xb   = t;
3417:       PetscLogFlops(2.0*a->nz);  /* undercounts diag inverse */
3418:     } else xb = b;

3420:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {

3422:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3423:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3424:         ibdiag -= sizes[i]*sizes[i];

3426:         /* set RHS */
3427:         if (xb == b) {
3428:           /* whole (old way) */
3429:           sz      = ii[row+1] - ii[row];
3430:           idx     = a->j + ii[row];
3431:           switch (sizes[i]) {
3432:           case 5:
3433:             v5      = a->a + ii[row-4];
3434:           case 4: /* fall through */
3435:             v4      = a->a + ii[row-3];
3436:           case 3:
3437:             v3      = a->a + ii[row-2];
3438:           case 2:
3439:             v2      = a->a + ii[row-1];
3440:           case 1:
3441:             v1      = a->a + ii[row];
3442:             break;
3443:           default:
3444:             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3445:           }
3446:         } else {
3447:           /* upper, no diag */
3448:           sz      = ii[row+1] - diag[row] - 1;
3449:           idx     = a->j + diag[row] + 1;
3450:           switch (sizes[i]) {
3451:           case 5:
3452:             v5      = a->a + diag[row-4] + 5;
3453:           case 4: /* fall through */
3454:             v4      = a->a + diag[row-3] + 4;
3455:           case 3:
3456:             v3      = a->a + diag[row-2] + 3;
3457:           case 2:
3458:             v2      = a->a + diag[row-1] + 2;
3459:           case 1:
3460:             v1      = a->a + diag[row] + 1;
3461:           }
3462:         }
3463:         /* set sum */
3464:         switch (sizes[i]) {
3465:         case 5:
3466:           sum5 = xb[row-4];
3467:         case 4: /* fall through */
3468:           sum4 = xb[row-3];
3469:         case 3:
3470:           sum3 = xb[row-2];
3471:         case 2:
3472:           sum2 = xb[row-1];
3473:         case 1:
3474:           /* note that sum1 is associated with the last row */
3475:           sum1 = xb[row];
3476:         }
3477:         /* do sums */
3478:         for (n = 0; n<sz-1; n+=2) {
3479:             i1    = idx[0];
3480:             i2    = idx[1];
3481:             idx  += 2;
3482:             tmp0  = x[i1];
3483:             tmp1  = x[i2];
3484:             switch (sizes[i]) {
3485:             case 5:
3486:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3487:             case 4: /* fall through */
3488:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3489:             case 3:
3490:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3491:             case 2:
3492:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3493:             case 1:
3494:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3495:             }
3496:         }
3497:         /* ragged edge */
3498:         if (n == sz-1) {
3499:           tmp0  = x[*idx];
3500:           switch (sizes[i]) {
3501:           case 5:
3502:             sum5 -= *v5*tmp0;
3503:           case 4: /* fall through */
3504:             sum4 -= *v4*tmp0;
3505:           case 3:
3506:             sum3 -= *v3*tmp0;
3507:           case 2:
3508:             sum2 -= *v2*tmp0;
3509:           case 1:
3510:             sum1 -= *v1*tmp0;
3511:           }
3512:         }
3513:         /* update */
3514:         if (xb == b) {
3515:           /* whole (old way) w/ diag */
3516:           switch (sizes[i]) {
3517:           case 5:
3518:             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3519:             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3520:             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3521:             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3522:             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3523:             break;
3524:           case 4:
3525:             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3526:             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3527:             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3528:             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3529:             break;
3530:           case 3:
3531:             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3532:             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3533:             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3534:             break;
3535:           case 2:
3536:             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3537:             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3538:             break;
3539:           case 1:
3540:             x[row--] += sum1*(*ibdiag);
3541:             break;
3542:           }
3543:         } else {
3544:           /* no diag so set =  */
3545:           switch (sizes[i]) {
3546:           case 5:
3547:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3548:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3549:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3550:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3551:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3552:             break;
3553:           case 4:
3554:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3555:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3556:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3557:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3558:             break;
3559:           case 3:
3560:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3561:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3562:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3563:             break;
3564:           case 2:
3565:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3566:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3567:             break;
3568:           case 1:
3569:             x[row--] = sum1*(*ibdiag);
3570:             break;
3571:           }
3572:         }
3573:       }
3574:       if (xb == b) {
3575:         PetscLogFlops(2.0*a->nz);
3576:       } else {
3577:         PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3578:       }
3579:     }
3580:   }
3581:   if (flag & SOR_EISENSTAT) {
3582:     /*
3583:           Apply  (U + D)^-1  where D is now the block diagonal
3584:     */
3585:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3586:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3587:       ibdiag -= sizes[i]*sizes[i];
3588:       sz      = ii[row+1] - diag[row] - 1;
3589:       v1      = a->a + diag[row] + 1;
3590:       idx     = a->j + diag[row] + 1;
3591:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3592:       switch (sizes[i]) {
3593:       case 1:

3595:         sum1 = b[row];
3596:         for (n = 0; n<sz-1; n+=2) {
3597:           i1    = idx[0];
3598:           i2    = idx[1];
3599:           idx  += 2;
3600:           tmp0  = x[i1];
3601:           tmp1  = x[i2];
3602:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3603:         }

3605:         if (n == sz-1) {
3606:           tmp0  = x[*idx];
3607:           sum1 -= *v1*tmp0;
3608:         }
3609:         x[row] = sum1*(*ibdiag);row--;
3610:         break;

3612:       case 2:

3614:         sum1 = b[row];
3615:         sum2 = b[row-1];
3616:         /* note that sum1 is associated with the second of the two rows */
3617:         v2 = a->a + diag[row-1] + 2;
3618:         for (n = 0; n<sz-1; n+=2) {
3619:           i1    = idx[0];
3620:           i2    = idx[1];
3621:           idx  += 2;
3622:           tmp0  = x[i1];
3623:           tmp1  = x[i2];
3624:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3625:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3626:         }

3628:         if (n == sz-1) {
3629:           tmp0  = x[*idx];
3630:           sum1 -= *v1*tmp0;
3631:           sum2 -= *v2*tmp0;
3632:         }
3633:         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3634:         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3635:         row     -= 2;
3636:         break;
3637:       case 3:

3639:         sum1 = b[row];
3640:         sum2 = b[row-1];
3641:         sum3 = b[row-2];
3642:         v2   = a->a + diag[row-1] + 2;
3643:         v3   = a->a + diag[row-2] + 3;
3644:         for (n = 0; n<sz-1; n+=2) {
3645:           i1    = idx[0];
3646:           i2    = idx[1];
3647:           idx  += 2;
3648:           tmp0  = x[i1];
3649:           tmp1  = x[i2];
3650:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3651:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3652:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3653:         }

3655:         if (n == sz-1) {
3656:           tmp0  = x[*idx];
3657:           sum1 -= *v1*tmp0;
3658:           sum2 -= *v2*tmp0;
3659:           sum3 -= *v3*tmp0;
3660:         }
3661:         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3662:         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3663:         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3664:         row     -= 3;
3665:         break;
3666:       case 4:

3668:         sum1 = b[row];
3669:         sum2 = b[row-1];
3670:         sum3 = b[row-2];
3671:         sum4 = b[row-3];
3672:         v2   = a->a + diag[row-1] + 2;
3673:         v3   = a->a + diag[row-2] + 3;
3674:         v4   = a->a + diag[row-3] + 4;
3675:         for (n = 0; n<sz-1; n+=2) {
3676:           i1    = idx[0];
3677:           i2    = idx[1];
3678:           idx  += 2;
3679:           tmp0  = x[i1];
3680:           tmp1  = x[i2];
3681:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3682:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3683:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3684:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3685:         }

3687:         if (n == sz-1) {
3688:           tmp0  = x[*idx];
3689:           sum1 -= *v1*tmp0;
3690:           sum2 -= *v2*tmp0;
3691:           sum3 -= *v3*tmp0;
3692:           sum4 -= *v4*tmp0;
3693:         }
3694:         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3695:         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3696:         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3697:         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3698:         row     -= 4;
3699:         break;
3700:       case 5:

3702:         sum1 = b[row];
3703:         sum2 = b[row-1];
3704:         sum3 = b[row-2];
3705:         sum4 = b[row-3];
3706:         sum5 = b[row-4];
3707:         v2   = a->a + diag[row-1] + 2;
3708:         v3   = a->a + diag[row-2] + 3;
3709:         v4   = a->a + diag[row-3] + 4;
3710:         v5   = a->a + diag[row-4] + 5;
3711:         for (n = 0; n<sz-1; n+=2) {
3712:           i1    = idx[0];
3713:           i2    = idx[1];
3714:           idx  += 2;
3715:           tmp0  = x[i1];
3716:           tmp1  = x[i2];
3717:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3718:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3719:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3720:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3721:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3722:         }

3724:         if (n == sz-1) {
3725:           tmp0  = x[*idx];
3726:           sum1 -= *v1*tmp0;
3727:           sum2 -= *v2*tmp0;
3728:           sum3 -= *v3*tmp0;
3729:           sum4 -= *v4*tmp0;
3730:           sum5 -= *v5*tmp0;
3731:         }
3732:         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3733:         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3734:         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3735:         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3736:         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3737:         row     -= 5;
3738:         break;
3739:       default:
3740:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3741:       }
3742:     }
3743:     PetscLogFlops(a->nz);

3745:     /*
3746:            t = b - D x    where D is the block diagonal
3747:     */
3748:     cnt = 0;
3749:     for (i=0, row=0; i<m; i++) {
3750:       switch (sizes[i]) {
3751:       case 1:
3752:         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3753:         break;
3754:       case 2:
3755:         x1       = x[row]; x2 = x[row+1];
3756:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3757:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3758:         t[row]   = b[row] - tmp1;
3759:         t[row+1] = b[row+1] - tmp2; row += 2;
3760:         cnt     += 4;
3761:         break;
3762:       case 3:
3763:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3764:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3765:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3766:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3767:         t[row]   = b[row] - tmp1;
3768:         t[row+1] = b[row+1] - tmp2;
3769:         t[row+2] = b[row+2] - tmp3; row += 3;
3770:         cnt     += 9;
3771:         break;
3772:       case 4:
3773:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3774:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3775:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3776:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3777:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3778:         t[row]   = b[row] - tmp1;
3779:         t[row+1] = b[row+1] - tmp2;
3780:         t[row+2] = b[row+2] - tmp3;
3781:         t[row+3] = b[row+3] - tmp4; row += 4;
3782:         cnt     += 16;
3783:         break;
3784:       case 5:
3785:         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3786:         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3787:         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3788:         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3789:         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3790:         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3791:         t[row]   = b[row] - tmp1;
3792:         t[row+1] = b[row+1] - tmp2;
3793:         t[row+2] = b[row+2] - tmp3;
3794:         t[row+3] = b[row+3] - tmp4;
3795:         t[row+4] = b[row+4] - tmp5;row += 5;
3796:         cnt     += 25;
3797:         break;
3798:       default:
3799:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3800:       }
3801:     }
3802:     PetscLogFlops(m);

3804:     /*
3805:           Apply (L + D)^-1 where D is the block diagonal
3806:     */
3807:     for (i=0, row=0; i<m; i++) {
3808:       sz  = diag[row] - ii[row];
3809:       v1  = a->a + ii[row];
3810:       idx = a->j + ii[row];
3811:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3812:       switch (sizes[i]) {
3813:       case 1:

3815:         sum1 = t[row];
3816:         for (n = 0; n<sz-1; n+=2) {
3817:           i1    = idx[0];
3818:           i2    = idx[1];
3819:           idx  += 2;
3820:           tmp0  = t[i1];
3821:           tmp1  = t[i2];
3822:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3823:         }

3825:         if (n == sz-1) {
3826:           tmp0  = t[*idx];
3827:           sum1 -= *v1 * tmp0;
3828:         }
3829:         x[row] += t[row] = sum1*(*ibdiag++); row++;
3830:         break;
3831:       case 2:
3832:         v2   = a->a + ii[row+1];
3833:         sum1 = t[row];
3834:         sum2 = t[row+1];
3835:         for (n = 0; n<sz-1; n+=2) {
3836:           i1    = idx[0];
3837:           i2    = idx[1];
3838:           idx  += 2;
3839:           tmp0  = t[i1];
3840:           tmp1  = t[i2];
3841:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3842:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3843:         }

3845:         if (n == sz-1) {
3846:           tmp0  = t[*idx];
3847:           sum1 -= v1[0] * tmp0;
3848:           sum2 -= v2[0] * tmp0;
3849:         }
3850:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3851:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3852:         ibdiag   += 4; row += 2;
3853:         break;
3854:       case 3:
3855:         v2   = a->a + ii[row+1];
3856:         v3   = a->a + ii[row+2];
3857:         sum1 = t[row];
3858:         sum2 = t[row+1];
3859:         sum3 = t[row+2];
3860:         for (n = 0; n<sz-1; n+=2) {
3861:           i1    = idx[0];
3862:           i2    = idx[1];
3863:           idx  += 2;
3864:           tmp0  = t[i1];
3865:           tmp1  = t[i2];
3866:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3867:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3868:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3869:         }

3871:         if (n == sz-1) {
3872:           tmp0  = t[*idx];
3873:           sum1 -= v1[0] * tmp0;
3874:           sum2 -= v2[0] * tmp0;
3875:           sum3 -= v3[0] * tmp0;
3876:         }
3877:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3878:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3879:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3880:         ibdiag   += 9; row += 3;
3881:         break;
3882:       case 4:
3883:         v2   = a->a + ii[row+1];
3884:         v3   = a->a + ii[row+2];
3885:         v4   = a->a + ii[row+3];
3886:         sum1 = t[row];
3887:         sum2 = t[row+1];
3888:         sum3 = t[row+2];
3889:         sum4 = t[row+3];
3890:         for (n = 0; n<sz-1; n+=2) {
3891:           i1    = idx[0];
3892:           i2    = idx[1];
3893:           idx  += 2;
3894:           tmp0  = t[i1];
3895:           tmp1  = t[i2];
3896:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3897:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3898:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3899:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3900:         }

3902:         if (n == sz-1) {
3903:           tmp0  = t[*idx];
3904:           sum1 -= v1[0] * tmp0;
3905:           sum2 -= v2[0] * tmp0;
3906:           sum3 -= v3[0] * tmp0;
3907:           sum4 -= v4[0] * tmp0;
3908:         }
3909:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3910:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3911:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3912:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3913:         ibdiag   += 16; row += 4;
3914:         break;
3915:       case 5:
3916:         v2   = a->a + ii[row+1];
3917:         v3   = a->a + ii[row+2];
3918:         v4   = a->a + ii[row+3];
3919:         v5   = a->a + ii[row+4];
3920:         sum1 = t[row];
3921:         sum2 = t[row+1];
3922:         sum3 = t[row+2];
3923:         sum4 = t[row+3];
3924:         sum5 = t[row+4];
3925:         for (n = 0; n<sz-1; n+=2) {
3926:           i1    = idx[0];
3927:           i2    = idx[1];
3928:           idx  += 2;
3929:           tmp0  = t[i1];
3930:           tmp1  = t[i2];
3931:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3932:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3933:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3934:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3935:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3936:         }

3938:         if (n == sz-1) {
3939:           tmp0  = t[*idx];
3940:           sum1 -= v1[0] * tmp0;
3941:           sum2 -= v2[0] * tmp0;
3942:           sum3 -= v3[0] * tmp0;
3943:           sum4 -= v4[0] * tmp0;
3944:           sum5 -= v5[0] * tmp0;
3945:         }
3946:         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3947:         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3948:         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3949:         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3950:         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3951:         ibdiag   += 25; row += 5;
3952:         break;
3953:       default:
3954:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3955:       }
3956:     }
3957:     PetscLogFlops(a->nz);
3958:   }
3959:   VecRestoreArray(xx,&x);
3960:   VecRestoreArrayRead(bb,&b);
3961:   return(0);
3962: }

3964: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3965: {
3966:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3967:   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3968:   const MatScalar   *bdiag = a->inode.bdiag;
3969:   const PetscScalar *b;
3970:   PetscErrorCode    ierr;
3971:   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3972:   const PetscInt    *sizes = a->inode.size;

3975:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3976:   VecGetArray(xx,&x);
3977:   VecGetArrayRead(bb,&b);
3978:   cnt  = 0;
3979:   for (i=0, row=0; i<m; i++) {
3980:     switch (sizes[i]) {
3981:     case 1:
3982:       x[row] = b[row]*bdiag[cnt++];row++;
3983:       break;
3984:     case 2:
3985:       x1       = b[row]; x2 = b[row+1];
3986:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3987:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3988:       x[row++] = tmp1;
3989:       x[row++] = tmp2;
3990:       cnt     += 4;
3991:       break;
3992:     case 3:
3993:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3994:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3995:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3996:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3997:       x[row++] = tmp1;
3998:       x[row++] = tmp2;
3999:       x[row++] = tmp3;
4000:       cnt     += 9;
4001:       break;
4002:     case 4:
4003:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4004:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4005:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4006:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4007:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4008:       x[row++] = tmp1;
4009:       x[row++] = tmp2;
4010:       x[row++] = tmp3;
4011:       x[row++] = tmp4;
4012:       cnt     += 16;
4013:       break;
4014:     case 5:
4015:       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4016:       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4017:       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4018:       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4019:       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4020:       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4021:       x[row++] = tmp1;
4022:       x[row++] = tmp2;
4023:       x[row++] = tmp3;
4024:       x[row++] = tmp4;
4025:       x[row++] = tmp5;
4026:       cnt     += 25;
4027:       break;
4028:     default:
4029:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4030:     }
4031:   }
4032:   PetscLogFlops(2.0*cnt);
4033:   VecRestoreArray(xx,&x);
4034:   VecRestoreArrayRead(bb,&b);
4035:   return(0);
4036: }

4038: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4039: {
4040:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4043:   a->inode.node_count       = 0;
4044:   a->inode.use              = PETSC_FALSE;
4045:   a->inode.checked          = PETSC_FALSE;
4046:   a->inode.mat_nonzerostate = -1;
4047:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4048:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4049:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4050:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4051:   A->ops->coloringpatch     = NULL;
4052:   A->ops->multdiagonalblock = NULL;
4053:   if (A->factortype) {
4054:     A->ops->solve = MatSolve_SeqAIJ_inplace;
4055:   }
4056:   return(0);
4057: }

4059: /*
4060:     samestructure indicates that the matrix has not changed its nonzero structure so we
4061:     do not need to recompute the inodes
4062: */
4063: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4064: {
4065:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4067:   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4068:   PetscBool      flag;
4069:   const PetscInt *idx,*idy,*ii;

4072:   if (!a->inode.use) {
4073:     MatSeqAIJ_Inode_ResetOps(A);
4074:     PetscFree(a->inode.size);
4075:     return(0);
4076:   }
4077:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);

4079:   m = A->rmap->n;
4080:   if (!a->inode.size) { PetscMalloc1(m+1,&a->inode.size); }
4081:   ns = a->inode.size;

4083:   i          = 0;
4084:   node_count = 0;
4085:   idx        = a->j;
4086:   ii         = a->i;
4087:   while (i < m) {                /* For each row */
4088:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4089:     /* Limits the number of elements in a node to 'a->inode.limit' */
4090:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4091:       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4092:       if (nzy != nzx) break;
4093:       idy += nzx;              /* Same nonzero pattern */
4094:       PetscArraycmp(idx,idy,nzx,&flag);
4095:       if (!flag) break;
4096:     }
4097:     ns[node_count++] = blk_size;
4098:     idx             += blk_size*nzx;
4099:     i                = j;
4100:   }

4102:   /* If not enough inodes found,, do not use inode version of the routines */
4103:   if (!m || node_count > .8*m) {
4104:     MatSeqAIJ_Inode_ResetOps(A);
4105:     PetscFree(a->inode.size);
4106:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4107:   } else {
4108:     if (!A->factortype) {
4109:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4110:       if (A->rmap->n == A->cmap->n) {
4111:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4112:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4113:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4114:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4115:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4116:       }
4117:     } else {
4118:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4119:     }
4120:     a->inode.node_count = node_count;
4121:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4122:   }
4123:   a->inode.checked          = PETSC_TRUE;
4124:   a->inode.mat_nonzerostate = A->nonzerostate;
4125:   return(0);
4126: }

4128: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4129: {
4130:   Mat            B =*C;
4131:   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4133:   PetscInt       m=A->rmap->n;

4136:   c->inode.use              = a->inode.use;
4137:   c->inode.limit            = a->inode.limit;
4138:   c->inode.max_limit        = a->inode.max_limit;
4139:   c->inode.checked          = PETSC_FALSE;
4140:   c->inode.size             = NULL;
4141:   c->inode.node_count       = 0;
4142:   c->inode.ibdiagvalid      = PETSC_FALSE;
4143:   c->inode.ibdiag           = NULL;
4144:   c->inode.bdiag            = NULL;
4145:   c->inode.mat_nonzerostate = -1;
4146:   if (a->inode.use) {
4147:     if (a->inode.checked && a->inode.size) {
4148:       PetscMalloc1(m+1,&c->inode.size);
4149:       PetscArraycpy(c->inode.size,a->inode.size,m+1);

4151:       c->inode.checked          = PETSC_TRUE;
4152:       c->inode.node_count       = a->inode.node_count;
4153:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4154:     }
4155:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4156:     if (!B->factortype) {
4157:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4158:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4159:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4160:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4161:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4162:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4163:     } else {
4164:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4165:     }
4166:   }
4167:   return(0);
4168: }

4170: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4171: {
4172:   PetscInt       k;
4173:   const PetscInt *vi;

4176:   vi = aj + ai[row];
4177:   for (k=0; k<nzl; k++) cols[k] = vi[k];
4178:   vi        = aj + adiag[row];
4179:   cols[nzl] = vi[0];
4180:   vi        = aj + adiag[row+1]+1;
4181:   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4182:   return(0);
4183: }
4184: /*
4185:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4186:    Modified from MatSeqAIJCheckInode().

4188:    Input Parameters:
4189: .  Mat A - ILU or LU matrix factor

4191: */
4192: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4193: {
4194:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4196:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4197:   PetscInt       *cols1,*cols2,*ns;
4198:   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4199:   PetscBool      flag;

4202:   if (!a->inode.use)    return(0);
4203:   if (a->inode.checked) return(0);

4205:   m = A->rmap->n;
4206:   if (a->inode.size) ns = a->inode.size;
4207:   else {
4208:     PetscMalloc1(m+1,&ns);
4209:   }

4211:   i          = 0;
4212:   node_count = 0;
4213:   PetscMalloc2(m,&cols1,m,&cols2);
4214:   while (i < m) {                /* For each row */
4215:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4216:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4217:     nzx  = nzl1 + nzu1 + 1;
4218:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);

4220:     /* Limits the number of elements in a node to 'a->inode.limit' */
4221:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4222:       nzl2 = ai[j+1] - ai[j];
4223:       nzu2 = adiag[j] - adiag[j+1] - 1;
4224:       nzy  = nzl2 + nzu2 + 1;
4225:       if (nzy != nzx) break;
4226:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4227:       PetscArraycmp(cols1,cols2,nzx,&flag);
4228:       if (!flag) break;
4229:     }
4230:     ns[node_count++] = blk_size;
4231:     i                = j;
4232:   }
4233:   PetscFree2(cols1,cols2);
4234:   /* If not enough inodes found,, do not use inode version of the routines */
4235:   if (!m || node_count > .8*m) {
4236:     PetscFree(ns);

4238:     a->inode.node_count = 0;
4239:     a->inode.size       = NULL;
4240:     a->inode.use        = PETSC_FALSE;

4242:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4243:   } else {
4244:     A->ops->mult              = NULL;
4245:     A->ops->sor               = NULL;
4246:     A->ops->multadd           = NULL;
4247:     A->ops->getrowij          = NULL;
4248:     A->ops->restorerowij      = NULL;
4249:     A->ops->getcolumnij       = NULL;
4250:     A->ops->restorecolumnij   = NULL;
4251:     A->ops->coloringpatch     = NULL;
4252:     A->ops->multdiagonalblock = NULL;
4253:     a->inode.node_count       = node_count;
4254:     a->inode.size             = ns;

4256:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4257:   }
4258:   a->inode.checked = PETSC_TRUE;
4259:   return(0);
4260: }

4262: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4263: {
4264:   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;

4267:   a->inode.ibdiagvalid = PETSC_FALSE;
4268:   return(0);
4269: }

4271: /*
4272:      This is really ugly. if inodes are used this replaces the
4273:   permutations with ones that correspond to rows/cols of the matrix
4274:   rather then inode blocks
4275: */
4276: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4277: {

4281:   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4282:   return(0);
4283: }

4285: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4286: {
4287:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4289:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4290:   const PetscInt *ridx,*cidx;
4291:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4292:   PetscInt       nslim_col,*ns_col;
4293:   IS             ris = *rperm,cis = *cperm;

4296:   if (!a->inode.size) return(0); /* no inodes so return */
4297:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

4299:   MatCreateColInode_Private(A,&nslim_col,&ns_col);
4300:   PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4301:   PetscMalloc2(m,&permr,n,&permc);

4303:   ISGetIndices(ris,&ridx);
4304:   ISGetIndices(cis,&cidx);

4306:   /* Form the inode structure for the rows of permuted matric using inv perm*/
4307:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

4309:   /* Construct the permutations for rows*/
4310:   for (i=0,row = 0; i<nslim_row; ++i) {
4311:     indx      = ridx[i];
4312:     start_val = tns[indx];
4313:     end_val   = tns[indx + 1];
4314:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4315:   }

4317:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4318:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

4320:   /* Construct permutations for columns */
4321:   for (i=0,col=0; i<nslim_col; ++i) {
4322:     indx      = cidx[i];
4323:     start_val = tns[indx];
4324:     end_val   = tns[indx + 1];
4325:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4326:   }

4328:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4329:   ISSetPermutation(*rperm);
4330:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4331:   ISSetPermutation(*cperm);

4333:   ISRestoreIndices(ris,&ridx);
4334:   ISRestoreIndices(cis,&cidx);

4336:   PetscFree(ns_col);
4337:   PetscFree2(permr,permc);
4338:   ISDestroy(&cis);
4339:   ISDestroy(&ris);
4340:   PetscFree(tns);
4341:   return(0);
4342: }

4344: /*@C
4345:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

4347:    Not Collective

4349:    Input Parameter:
4350: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

4352:    Output Parameters:
4353: +  node_count - no of inodes present in the matrix.
4354: .  sizes      - an array of size node_count,with sizes of each inode.
4355: -  limit      - the max size used to generate the inodes.

4357:    Level: advanced

4359:    Notes:
4360:     This routine returns some internal storage information
4361:    of the matrix, it is intended to be used by advanced users.
4362:    It should be called after the matrix is assembled.
4363:    The contents of the sizes[] array should not be changed.
4364:    NULL may be passed for information not requested.

4366: .seealso: MatGetInfo()
4367: @*/
4368: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4369: {
4370:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

4373:   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4374:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4375:   if (f) {
4376:     (*f)(A,node_count,sizes,limit);
4377:   }
4378:   return(0);
4379: }

4381: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4382: {
4383:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

4386:   if (node_count) *node_count = a->inode.node_count;
4387:   if (sizes)      *sizes      = a->inode.size;
4388:   if (limit)      *limit      = a->inode.limit;
4389:   return(0);
4390: }