Actual source code: inode.c

  1: /*
  2:   This file provides high performance routines for the Inode format (compressed sparse row)
  3:   by taking advantage of rows with identical nonzero structure (I-nodes).
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #if defined(PETSC_HAVE_XMMINTRIN_H)
  7:   #include <xmmintrin.h>
  8: #endif

 10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
 13:   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;

 15:   PetscFunctionBegin;
 16:   n = A->cmap->n;
 17:   m = A->rmap->n;
 18:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
 24:       ;
 25:     for (; count + 1 < n; count++, i++)
 26:       ;
 27:     if (count < n) i++;
 28:     *size = i;
 29:     PetscFunctionReturn(PETSC_SUCCESS);
 30:   }
 31:   PetscCall(PetscMalloc1(n + 1, &ns_col));

 33:   /* Use the same row structure wherever feasible. */
 34:   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];

 36:   /* if m < n; pad up the remainder with inode_limit */
 37:   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
 38:   /* The last node is the odd ball. pad it up with the remaining rows; */
 39:   if (count < n) {
 40:     ns_col[i] = n - count;
 41:     i++;
 42:   } else if (count > n) {
 43:     /* Adjust for the over estimation */
 44:     ns_col[i - 1] += n - count;
 45:   }
 46:   *size = i;
 47:   *ns   = ns_col;
 48:   PetscFunctionReturn(PETSC_SUCCESS);
 49: }

 51: /*
 52:       This builds symmetric version of nonzero structure,
 53: */
 54: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
 55: {
 56:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
 57:   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
 58:   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
 59:   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;

 61:   PetscFunctionBegin;
 62:   nslim_row = a->inode.node_count;
 63:   m         = A->rmap->n;
 64:   n         = A->cmap->n;
 65:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 66:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");

 68:   /* Use the row_inode as column_inode */
 69:   nslim_col = nslim_row;
 70:   ns_col    = ns_row;

 72:   /* allocate space for reformatted inode structure */
 73:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
 74:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];

 76:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
 77:     nsz = ns_col[i1];
 78:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
 79:   }
 80:   /* allocate space for row pointers */
 81:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
 82:   *iia = ia;
 83:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

 85:   /* determine the number of columns in each row */
 86:   ia[0] = oshift;
 87:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
 88:     j    = aj + ai[row] + ishift;
 89:     jmax = aj + ai[row + 1] + ishift;
 90:     if (j == jmax) continue; /* empty row */
 91:     col = *j++ + ishift;
 92:     i2  = tvc[col];
 93:     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
 94:       ia[i1 + 1]++;
 95:       ia[i2 + 1]++;
 96:       i2++; /* Start col of next node */
 97:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
 98:       i2 = tvc[col];
 99:     }
100:     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101:   }

103:   /* shift ia[i] to point to next row */
104:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105:     row = ia[i1 - 1];
106:     ia[i1] += row;
107:     work[i1 - 1] = row - oshift;
108:   }

110:   /* allocate space for column pointers */
111:   nz = ia[nslim_row] + (!ishift);
112:   PetscCall(PetscMalloc1(nz, &ja));
113:   *jja = ja;

115:   /* loop over lower triangular part putting into ja */
116:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
117:     j    = aj + ai[row] + ishift;
118:     jmax = aj + ai[row + 1] + ishift;
119:     if (j == jmax) continue; /* empty row */
120:     col = *j++ + ishift;
121:     i2  = tvc[col];
122:     while (i2 < i1 && j < jmax) {
123:       ja[work[i2]++] = i1 + oshift;
124:       ja[work[i1]++] = i2 + oshift;
125:       ++i2;
126:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
127:       i2 = tvc[col];
128:     }
129:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
130:   }
131:   PetscCall(PetscFree(work));
132:   PetscCall(PetscFree2(tns, tvc));
133:   PetscFunctionReturn(PETSC_SUCCESS);
134: }

136: /*
137:       This builds nonsymmetric version of nonzero structure,
138: */
139: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
140: {
141:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
142:   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
143:   PetscInt       *tns, *tvc, nsz, i1, i2;
144:   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;

146:   PetscFunctionBegin;
147:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
148:   nslim_row = a->inode.node_count;
149:   n         = A->cmap->n;

151:   /* Create The column_inode for this matrix */
152:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

154:   /* allocate space for reformatted column_inode structure */
155:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
156:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

158:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
159:     nsz = ns_col[i1];
160:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
161:   }
162:   /* allocate space for row pointers */
163:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
164:   *iia = ia;
165:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

167:   /* determine the number of columns in each row */
168:   ia[0] = oshift;
169:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
170:     j  = aj + ai[row] + ishift;
171:     nz = ai[row + 1] - ai[row];
172:     if (!nz) continue; /* empty row */
173:     col = *j++ + ishift;
174:     i2  = tvc[col];
175:     while (nz-- > 0) { /* off-diagonal elements */
176:       ia[i1 + 1]++;
177:       i2++; /* Start col of next node */
178:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
179:       if (nz > 0) i2 = tvc[col];
180:     }
181:   }

183:   /* shift ia[i] to point to next row */
184:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
185:     row = ia[i1 - 1];
186:     ia[i1] += row;
187:     work[i1 - 1] = row - oshift;
188:   }

190:   /* allocate space for column pointers */
191:   nz = ia[nslim_row] + (!ishift);
192:   PetscCall(PetscMalloc1(nz, &ja));
193:   *jja = ja;

195:   /* loop over matrix putting into ja */
196:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
197:     j  = aj + ai[row] + ishift;
198:     nz = ai[row + 1] - ai[row];
199:     if (!nz) continue; /* empty row */
200:     col = *j++ + ishift;
201:     i2  = tvc[col];
202:     while (nz-- > 0) {
203:       ja[work[i1]++] = i2 + oshift;
204:       ++i2;
205:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
206:       if (nz > 0) i2 = tvc[col];
207:     }
208:   }
209:   PetscCall(PetscFree(ns_col));
210:   PetscCall(PetscFree(work));
211:   PetscCall(PetscFree2(tns, tvc));
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
216: {
217:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

219:   PetscFunctionBegin;
220:   if (n) *n = a->inode.node_count;
221:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
222:   if (!blockcompressed) {
223:     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
224:   } else if (symmetric) {
225:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
226:   } else {
227:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
228:   }
229:   PetscFunctionReturn(PETSC_SUCCESS);
230: }

232: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
233: {
234:   PetscFunctionBegin;
235:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

237:   if (!blockcompressed) {
238:     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
239:   } else {
240:     PetscCall(PetscFree(*ia));
241:     PetscCall(PetscFree(*ja));
242:   }
243:   PetscFunctionReturn(PETSC_SUCCESS);
244: }

246: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247: {
248:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249:   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
250:   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;

252:   PetscFunctionBegin;
253:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
254:   nslim_row = a->inode.node_count;
255:   n         = A->cmap->n;

257:   /* Create The column_inode for this matrix */
258:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

260:   /* allocate space for reformatted column_inode structure */
261:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
262:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

264:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
265:     nsz = ns_col[i1];
266:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
267:   }
268:   /* allocate space for column pointers */
269:   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
270:   *iia = ia;
271:   PetscCall(PetscMalloc1(nslim_col + 1, &work));

273:   /* determine the number of columns in each row */
274:   ia[0] = oshift;
275:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
276:     j   = aj + ai[row] + ishift;
277:     col = *j++ + ishift;
278:     i2  = tvc[col];
279:     nz  = ai[row + 1] - ai[row];
280:     while (nz-- > 0) { /* off-diagonal elements */
281:       /* ia[i1+1]++; */
282:       ia[i2 + 1]++;
283:       i2++;
284:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285:       if (nz > 0) i2 = tvc[col];
286:     }
287:   }

289:   /* shift ia[i] to point to next col */
290:   for (i1 = 1; i1 < nslim_col + 1; i1++) {
291:     col = ia[i1 - 1];
292:     ia[i1] += col;
293:     work[i1 - 1] = col - oshift;
294:   }

296:   /* allocate space for column pointers */
297:   nz = ia[nslim_col] + (!ishift);
298:   PetscCall(PetscMalloc1(nz, &ja));
299:   *jja = ja;

301:   /* loop over matrix putting into ja */
302:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
303:     j   = aj + ai[row] + ishift;
304:     col = *j++ + ishift;
305:     i2  = tvc[col];
306:     nz  = ai[row + 1] - ai[row];
307:     while (nz-- > 0) {
308:       /* ja[work[i1]++] = i2 + oshift; */
309:       ja[work[i2]++] = i1 + oshift;
310:       i2++;
311:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312:       if (nz > 0) i2 = tvc[col];
313:     }
314:   }
315:   PetscCall(PetscFree(ns_col));
316:   PetscCall(PetscFree(work));
317:   PetscCall(PetscFree2(tns, tvc));
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }

321: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322: {
323:   PetscFunctionBegin;
324:   PetscCall(MatCreateColInode_Private(A, n, NULL));
325:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

327:   if (!blockcompressed) {
328:     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
329:   } else if (symmetric) {
330:     /* Since the indices are symmetric it doesn't matter */
331:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
332:   } else {
333:     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
334:   }
335:   PetscFunctionReturn(PETSC_SUCCESS);
336: }

338: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339: {
340:   PetscFunctionBegin;
341:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
342:   if (!blockcompressed) {
343:     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
344:   } else {
345:     PetscCall(PetscFree(*ia));
346:     PetscCall(PetscFree(*ja));
347:   }
348:   PetscFunctionReturn(PETSC_SUCCESS);
349: }

351: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
352: {
353:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
354:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
355:   PetscScalar       *y;
356:   const PetscScalar *x;
357:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
358:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
359:   const PetscInt    *idx, *ns, *ii;

361: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
362:   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
363: #endif

365:   PetscFunctionBegin;
366:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
367:   node_max = a->inode.node_count;
368:   ns       = a->inode.size; /* Node Size array */
369:   PetscCall(VecGetArrayRead(xx, &x));
370:   PetscCall(VecGetArray(yy, &y));
371:   idx = a->j;
372:   v1  = a->a;
373:   ii  = a->i;

375:   for (i = 0, row = 0; i < node_max; ++i) {
376:     nsz = ns[i];
377:     n   = ii[1] - ii[0];
378:     nonzerorow += (n > 0) * nsz;
379:     ii += nsz;
380:     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
381:     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
382:     sz = n;                                                                /* No of non zeros in this row */
383:                                                                            /* Switch on the size of Node */
384:     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
385:     case 1:
386:       sum1 = 0.;

388:       for (n = 0; n < sz - 1; n += 2) {
389:         i1 = idx[0]; /* The instructions are ordered to */
390:         i2 = idx[1]; /* make the compiler's job easy */
391:         idx += 2;
392:         tmp0 = x[i1];
393:         tmp1 = x[i2];
394:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
395:         v1 += 2;
396:       }

398:       if (n == sz - 1) { /* Take care of the last nonzero  */
399:         tmp0 = x[*idx++];
400:         sum1 += *v1++ * tmp0;
401:       }
402:       y[row++] = sum1;
403:       break;
404:     case 2:
405:       sum1 = 0.;
406:       sum2 = 0.;
407:       v2   = v1 + n;

409:       for (n = 0; n < sz - 1; n += 2) {
410:         i1 = idx[0];
411:         i2 = idx[1];
412:         idx += 2;
413:         tmp0 = x[i1];
414:         tmp1 = x[i2];
415:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
416:         v1 += 2;
417:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
418:         v2 += 2;
419:       }
420:       if (n == sz - 1) {
421:         tmp0 = x[*idx++];
422:         sum1 += *v1++ * tmp0;
423:         sum2 += *v2++ * tmp0;
424:       }
425:       y[row++] = sum1;
426:       y[row++] = sum2;
427:       v1       = v2; /* Since the next block to be processed starts there*/
428:       idx += sz;
429:       break;
430:     case 3:
431:       sum1 = 0.;
432:       sum2 = 0.;
433:       sum3 = 0.;
434:       v2   = v1 + n;
435:       v3   = v2 + n;

437:       for (n = 0; n < sz - 1; n += 2) {
438:         i1 = idx[0];
439:         i2 = idx[1];
440:         idx += 2;
441:         tmp0 = x[i1];
442:         tmp1 = x[i2];
443:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
444:         v1 += 2;
445:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
446:         v2 += 2;
447:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
448:         v3 += 2;
449:       }
450:       if (n == sz - 1) {
451:         tmp0 = x[*idx++];
452:         sum1 += *v1++ * tmp0;
453:         sum2 += *v2++ * tmp0;
454:         sum3 += *v3++ * tmp0;
455:       }
456:       y[row++] = sum1;
457:       y[row++] = sum2;
458:       y[row++] = sum3;
459:       v1       = v3; /* Since the next block to be processed starts there*/
460:       idx += 2 * sz;
461:       break;
462:     case 4:
463:       sum1 = 0.;
464:       sum2 = 0.;
465:       sum3 = 0.;
466:       sum4 = 0.;
467:       v2   = v1 + n;
468:       v3   = v2 + n;
469:       v4   = v3 + n;

471:       for (n = 0; n < sz - 1; n += 2) {
472:         i1 = idx[0];
473:         i2 = idx[1];
474:         idx += 2;
475:         tmp0 = x[i1];
476:         tmp1 = x[i2];
477:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
478:         v1 += 2;
479:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
480:         v2 += 2;
481:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
482:         v3 += 2;
483:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
484:         v4 += 2;
485:       }
486:       if (n == sz - 1) {
487:         tmp0 = x[*idx++];
488:         sum1 += *v1++ * tmp0;
489:         sum2 += *v2++ * tmp0;
490:         sum3 += *v3++ * tmp0;
491:         sum4 += *v4++ * tmp0;
492:       }
493:       y[row++] = sum1;
494:       y[row++] = sum2;
495:       y[row++] = sum3;
496:       y[row++] = sum4;
497:       v1       = v4; /* Since the next block to be processed starts there*/
498:       idx += 3 * sz;
499:       break;
500:     case 5:
501:       sum1 = 0.;
502:       sum2 = 0.;
503:       sum3 = 0.;
504:       sum4 = 0.;
505:       sum5 = 0.;
506:       v2   = v1 + n;
507:       v3   = v2 + n;
508:       v4   = v3 + n;
509:       v5   = v4 + n;

511:       for (n = 0; n < sz - 1; n += 2) {
512:         i1 = idx[0];
513:         i2 = idx[1];
514:         idx += 2;
515:         tmp0 = x[i1];
516:         tmp1 = x[i2];
517:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
518:         v1 += 2;
519:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
520:         v2 += 2;
521:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
522:         v3 += 2;
523:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
524:         v4 += 2;
525:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
526:         v5 += 2;
527:       }
528:       if (n == sz - 1) {
529:         tmp0 = x[*idx++];
530:         sum1 += *v1++ * tmp0;
531:         sum2 += *v2++ * tmp0;
532:         sum3 += *v3++ * tmp0;
533:         sum4 += *v4++ * tmp0;
534:         sum5 += *v5++ * tmp0;
535:       }
536:       y[row++] = sum1;
537:       y[row++] = sum2;
538:       y[row++] = sum3;
539:       y[row++] = sum4;
540:       y[row++] = sum5;
541:       v1       = v5; /* Since the next block to be processed starts there */
542:       idx += 4 * sz;
543:       break;
544:     default:
545:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
546:     }
547:   }
548:   PetscCall(VecRestoreArrayRead(xx, &x));
549:   PetscCall(VecRestoreArray(yy, &y));
550:   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
551:   PetscFunctionReturn(PETSC_SUCCESS);
552: }

554: /* Almost same code as the MatMult_SeqAIJ_Inode() */
555: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
556: {
557:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
558:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
559:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
560:   const PetscScalar *x;
561:   PetscScalar       *y, *z, *zt;
562:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
563:   const PetscInt    *idx, *ns, *ii;

565:   PetscFunctionBegin;
566:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
567:   node_max = a->inode.node_count;
568:   ns       = a->inode.size; /* Node Size array */

570:   PetscCall(VecGetArrayRead(xx, &x));
571:   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
572:   zt = z;

574:   idx = a->j;
575:   v1  = a->a;
576:   ii  = a->i;

578:   for (i = 0, row = 0; i < node_max; ++i) {
579:     nsz = ns[i];
580:     n   = ii[1] - ii[0];
581:     ii += nsz;
582:     sz = n;        /* No of non zeros in this row */
583:                    /* Switch on the size of Node */
584:     switch (nsz) { /* Each loop in 'case' is unrolled */
585:     case 1:
586:       sum1 = *zt++;

588:       for (n = 0; n < sz - 1; n += 2) {
589:         i1 = idx[0]; /* The instructions are ordered to */
590:         i2 = idx[1]; /* make the compiler's job easy */
591:         idx += 2;
592:         tmp0 = x[i1];
593:         tmp1 = x[i2];
594:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
595:         v1 += 2;
596:       }

598:       if (n == sz - 1) { /* Take care of the last nonzero  */
599:         tmp0 = x[*idx++];
600:         sum1 += *v1++ * tmp0;
601:       }
602:       y[row++] = sum1;
603:       break;
604:     case 2:
605:       sum1 = *zt++;
606:       sum2 = *zt++;
607:       v2   = v1 + n;

609:       for (n = 0; n < sz - 1; n += 2) {
610:         i1 = idx[0];
611:         i2 = idx[1];
612:         idx += 2;
613:         tmp0 = x[i1];
614:         tmp1 = x[i2];
615:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
616:         v1 += 2;
617:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
618:         v2 += 2;
619:       }
620:       if (n == sz - 1) {
621:         tmp0 = x[*idx++];
622:         sum1 += *v1++ * tmp0;
623:         sum2 += *v2++ * tmp0;
624:       }
625:       y[row++] = sum1;
626:       y[row++] = sum2;
627:       v1       = v2; /* Since the next block to be processed starts there*/
628:       idx += sz;
629:       break;
630:     case 3:
631:       sum1 = *zt++;
632:       sum2 = *zt++;
633:       sum3 = *zt++;
634:       v2   = v1 + n;
635:       v3   = v2 + n;

637:       for (n = 0; n < sz - 1; n += 2) {
638:         i1 = idx[0];
639:         i2 = idx[1];
640:         idx += 2;
641:         tmp0 = x[i1];
642:         tmp1 = x[i2];
643:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
644:         v1 += 2;
645:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
646:         v2 += 2;
647:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
648:         v3 += 2;
649:       }
650:       if (n == sz - 1) {
651:         tmp0 = x[*idx++];
652:         sum1 += *v1++ * tmp0;
653:         sum2 += *v2++ * tmp0;
654:         sum3 += *v3++ * tmp0;
655:       }
656:       y[row++] = sum1;
657:       y[row++] = sum2;
658:       y[row++] = sum3;
659:       v1       = v3; /* Since the next block to be processed starts there*/
660:       idx += 2 * sz;
661:       break;
662:     case 4:
663:       sum1 = *zt++;
664:       sum2 = *zt++;
665:       sum3 = *zt++;
666:       sum4 = *zt++;
667:       v2   = v1 + n;
668:       v3   = v2 + n;
669:       v4   = v3 + n;

671:       for (n = 0; n < sz - 1; n += 2) {
672:         i1 = idx[0];
673:         i2 = idx[1];
674:         idx += 2;
675:         tmp0 = x[i1];
676:         tmp1 = x[i2];
677:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
678:         v1 += 2;
679:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
680:         v2 += 2;
681:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
682:         v3 += 2;
683:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
684:         v4 += 2;
685:       }
686:       if (n == sz - 1) {
687:         tmp0 = x[*idx++];
688:         sum1 += *v1++ * tmp0;
689:         sum2 += *v2++ * tmp0;
690:         sum3 += *v3++ * tmp0;
691:         sum4 += *v4++ * tmp0;
692:       }
693:       y[row++] = sum1;
694:       y[row++] = sum2;
695:       y[row++] = sum3;
696:       y[row++] = sum4;
697:       v1       = v4; /* Since the next block to be processed starts there*/
698:       idx += 3 * sz;
699:       break;
700:     case 5:
701:       sum1 = *zt++;
702:       sum2 = *zt++;
703:       sum3 = *zt++;
704:       sum4 = *zt++;
705:       sum5 = *zt++;
706:       v2   = v1 + n;
707:       v3   = v2 + n;
708:       v4   = v3 + n;
709:       v5   = v4 + n;

711:       for (n = 0; n < sz - 1; n += 2) {
712:         i1 = idx[0];
713:         i2 = idx[1];
714:         idx += 2;
715:         tmp0 = x[i1];
716:         tmp1 = x[i2];
717:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
718:         v1 += 2;
719:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
720:         v2 += 2;
721:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
722:         v3 += 2;
723:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
724:         v4 += 2;
725:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
726:         v5 += 2;
727:       }
728:       if (n == sz - 1) {
729:         tmp0 = x[*idx++];
730:         sum1 += *v1++ * tmp0;
731:         sum2 += *v2++ * tmp0;
732:         sum3 += *v3++ * tmp0;
733:         sum4 += *v4++ * tmp0;
734:         sum5 += *v5++ * tmp0;
735:       }
736:       y[row++] = sum1;
737:       y[row++] = sum2;
738:       y[row++] = sum3;
739:       y[row++] = sum4;
740:       y[row++] = sum5;
741:       v1       = v5; /* Since the next block to be processed starts there */
742:       idx += 4 * sz;
743:       break;
744:     default:
745:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
746:     }
747:   }
748:   PetscCall(VecRestoreArrayRead(xx, &x));
749:   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
750:   PetscCall(PetscLogFlops(2.0 * a->nz));
751:   PetscFunctionReturn(PETSC_SUCCESS);
752: }

754: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
755: {
756:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
757:   IS                 iscol = a->col, isrow = a->row;
758:   const PetscInt    *r, *c, *rout, *cout;
759:   PetscInt           i, j, n = A->rmap->n, nz;
760:   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
761:   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
762:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
763:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
764:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
765:   const PetscScalar *b;

767:   PetscFunctionBegin;
768:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
769:   node_max = a->inode.node_count;
770:   ns       = a->inode.size; /* Node Size array */

772:   PetscCall(VecGetArrayRead(bb, &b));
773:   PetscCall(VecGetArrayWrite(xx, &x));
774:   tmp = a->solve_work;

776:   PetscCall(ISGetIndices(isrow, &rout));
777:   r = rout;
778:   PetscCall(ISGetIndices(iscol, &cout));
779:   c = cout + (n - 1);

781:   /* forward solve the lower triangular */
782:   tmps = tmp;
783:   aa   = a_a;
784:   aj   = a_j;
785:   ad   = a->diag;

787:   for (i = 0, row = 0; i < node_max; ++i) {
788:     nsz = ns[i];
789:     aii = ai[row];
790:     v1  = aa + aii;
791:     vi  = aj + aii;
792:     nz  = ad[row] - aii;
793:     if (i < node_max - 1) {
794:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
795:       * but our indexing to determine its size could. */
796:       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
797:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
798:       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
799:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
800:     }

802:     switch (nsz) { /* Each loop in 'case' is unrolled */
803:     case 1:
804:       sum1 = b[*r++];
805:       for (j = 0; j < nz - 1; j += 2) {
806:         i0 = vi[0];
807:         i1 = vi[1];
808:         vi += 2;
809:         tmp0 = tmps[i0];
810:         tmp1 = tmps[i1];
811:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
812:         v1 += 2;
813:       }
814:       if (j == nz - 1) {
815:         tmp0 = tmps[*vi++];
816:         sum1 -= *v1++ * tmp0;
817:       }
818:       tmp[row++] = sum1;
819:       break;
820:     case 2:
821:       sum1 = b[*r++];
822:       sum2 = b[*r++];
823:       v2   = aa + ai[row + 1];

825:       for (j = 0; j < nz - 1; j += 2) {
826:         i0 = vi[0];
827:         i1 = vi[1];
828:         vi += 2;
829:         tmp0 = tmps[i0];
830:         tmp1 = tmps[i1];
831:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
832:         v1 += 2;
833:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
834:         v2 += 2;
835:       }
836:       if (j == nz - 1) {
837:         tmp0 = tmps[*vi++];
838:         sum1 -= *v1++ * tmp0;
839:         sum2 -= *v2++ * tmp0;
840:       }
841:       sum2 -= *v2++ * sum1;
842:       tmp[row++] = sum1;
843:       tmp[row++] = sum2;
844:       break;
845:     case 3:
846:       sum1 = b[*r++];
847:       sum2 = b[*r++];
848:       sum3 = b[*r++];
849:       v2   = aa + ai[row + 1];
850:       v3   = aa + ai[row + 2];

852:       for (j = 0; j < nz - 1; j += 2) {
853:         i0 = vi[0];
854:         i1 = vi[1];
855:         vi += 2;
856:         tmp0 = tmps[i0];
857:         tmp1 = tmps[i1];
858:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
859:         v1 += 2;
860:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
861:         v2 += 2;
862:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
863:         v3 += 2;
864:       }
865:       if (j == nz - 1) {
866:         tmp0 = tmps[*vi++];
867:         sum1 -= *v1++ * tmp0;
868:         sum2 -= *v2++ * tmp0;
869:         sum3 -= *v3++ * tmp0;
870:       }
871:       sum2 -= *v2++ * sum1;
872:       sum3 -= *v3++ * sum1;
873:       sum3 -= *v3++ * sum2;

875:       tmp[row++] = sum1;
876:       tmp[row++] = sum2;
877:       tmp[row++] = sum3;
878:       break;

880:     case 4:
881:       sum1 = b[*r++];
882:       sum2 = b[*r++];
883:       sum3 = b[*r++];
884:       sum4 = b[*r++];
885:       v2   = aa + ai[row + 1];
886:       v3   = aa + ai[row + 2];
887:       v4   = aa + ai[row + 3];

889:       for (j = 0; j < nz - 1; j += 2) {
890:         i0 = vi[0];
891:         i1 = vi[1];
892:         vi += 2;
893:         tmp0 = tmps[i0];
894:         tmp1 = tmps[i1];
895:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
896:         v1 += 2;
897:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
898:         v2 += 2;
899:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
900:         v3 += 2;
901:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
902:         v4 += 2;
903:       }
904:       if (j == nz - 1) {
905:         tmp0 = tmps[*vi++];
906:         sum1 -= *v1++ * tmp0;
907:         sum2 -= *v2++ * tmp0;
908:         sum3 -= *v3++ * tmp0;
909:         sum4 -= *v4++ * tmp0;
910:       }
911:       sum2 -= *v2++ * sum1;
912:       sum3 -= *v3++ * sum1;
913:       sum4 -= *v4++ * sum1;
914:       sum3 -= *v3++ * sum2;
915:       sum4 -= *v4++ * sum2;
916:       sum4 -= *v4++ * sum3;

918:       tmp[row++] = sum1;
919:       tmp[row++] = sum2;
920:       tmp[row++] = sum3;
921:       tmp[row++] = sum4;
922:       break;
923:     case 5:
924:       sum1 = b[*r++];
925:       sum2 = b[*r++];
926:       sum3 = b[*r++];
927:       sum4 = b[*r++];
928:       sum5 = b[*r++];
929:       v2   = aa + ai[row + 1];
930:       v3   = aa + ai[row + 2];
931:       v4   = aa + ai[row + 3];
932:       v5   = aa + ai[row + 4];

934:       for (j = 0; j < nz - 1; j += 2) {
935:         i0 = vi[0];
936:         i1 = vi[1];
937:         vi += 2;
938:         tmp0 = tmps[i0];
939:         tmp1 = tmps[i1];
940:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
941:         v1 += 2;
942:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
943:         v2 += 2;
944:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
945:         v3 += 2;
946:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
947:         v4 += 2;
948:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
949:         v5 += 2;
950:       }
951:       if (j == nz - 1) {
952:         tmp0 = tmps[*vi++];
953:         sum1 -= *v1++ * tmp0;
954:         sum2 -= *v2++ * tmp0;
955:         sum3 -= *v3++ * tmp0;
956:         sum4 -= *v4++ * tmp0;
957:         sum5 -= *v5++ * tmp0;
958:       }

960:       sum2 -= *v2++ * sum1;
961:       sum3 -= *v3++ * sum1;
962:       sum4 -= *v4++ * sum1;
963:       sum5 -= *v5++ * sum1;
964:       sum3 -= *v3++ * sum2;
965:       sum4 -= *v4++ * sum2;
966:       sum5 -= *v5++ * sum2;
967:       sum4 -= *v4++ * sum3;
968:       sum5 -= *v5++ * sum3;
969:       sum5 -= *v5++ * sum4;

971:       tmp[row++] = sum1;
972:       tmp[row++] = sum2;
973:       tmp[row++] = sum3;
974:       tmp[row++] = sum4;
975:       tmp[row++] = sum5;
976:       break;
977:     default:
978:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
979:     }
980:   }
981:   /* backward solve the upper triangular */
982:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
983:     nsz = ns[i];
984:     aii = ai[row + 1] - 1;
985:     v1  = aa + aii;
986:     vi  = aj + aii;
987:     nz  = aii - ad[row];
988:     switch (nsz) { /* Each loop in 'case' is unrolled */
989:     case 1:
990:       sum1 = tmp[row];

992:       for (j = nz; j > 1; j -= 2) {
993:         vi -= 2;
994:         i0   = vi[2];
995:         i1   = vi[1];
996:         tmp0 = tmps[i0];
997:         tmp1 = tmps[i1];
998:         v1 -= 2;
999:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000:       }
1001:       if (j == 1) {
1002:         tmp0 = tmps[*vi--];
1003:         sum1 -= *v1-- * tmp0;
1004:       }
1005:       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1006:       row--;
1007:       break;
1008:     case 2:
1009:       sum1 = tmp[row];
1010:       sum2 = tmp[row - 1];
1011:       v2   = aa + ai[row] - 1;
1012:       for (j = nz; j > 1; j -= 2) {
1013:         vi -= 2;
1014:         i0   = vi[2];
1015:         i1   = vi[1];
1016:         tmp0 = tmps[i0];
1017:         tmp1 = tmps[i1];
1018:         v1 -= 2;
1019:         v2 -= 2;
1020:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022:       }
1023:       if (j == 1) {
1024:         tmp0 = tmps[*vi--];
1025:         sum1 -= *v1-- * tmp0;
1026:         sum2 -= *v2-- * tmp0;
1027:       }

1029:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1030:       row--;
1031:       sum2 -= *v2-- * tmp0;
1032:       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1033:       row--;
1034:       break;
1035:     case 3:
1036:       sum1 = tmp[row];
1037:       sum2 = tmp[row - 1];
1038:       sum3 = tmp[row - 2];
1039:       v2   = aa + ai[row] - 1;
1040:       v3   = aa + ai[row - 1] - 1;
1041:       for (j = nz; j > 1; j -= 2) {
1042:         vi -= 2;
1043:         i0   = vi[2];
1044:         i1   = vi[1];
1045:         tmp0 = tmps[i0];
1046:         tmp1 = tmps[i1];
1047:         v1 -= 2;
1048:         v2 -= 2;
1049:         v3 -= 2;
1050:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1051:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1052:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1053:       }
1054:       if (j == 1) {
1055:         tmp0 = tmps[*vi--];
1056:         sum1 -= *v1-- * tmp0;
1057:         sum2 -= *v2-- * tmp0;
1058:         sum3 -= *v3-- * tmp0;
1059:       }
1060:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1061:       row--;
1062:       sum2 -= *v2-- * tmp0;
1063:       sum3 -= *v3-- * tmp0;
1064:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1065:       row--;
1066:       sum3 -= *v3-- * tmp0;
1067:       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1068:       row--;

1070:       break;
1071:     case 4:
1072:       sum1 = tmp[row];
1073:       sum2 = tmp[row - 1];
1074:       sum3 = tmp[row - 2];
1075:       sum4 = tmp[row - 3];
1076:       v2   = aa + ai[row] - 1;
1077:       v3   = aa + ai[row - 1] - 1;
1078:       v4   = aa + ai[row - 2] - 1;

1080:       for (j = nz; j > 1; j -= 2) {
1081:         vi -= 2;
1082:         i0   = vi[2];
1083:         i1   = vi[1];
1084:         tmp0 = tmps[i0];
1085:         tmp1 = tmps[i1];
1086:         v1 -= 2;
1087:         v2 -= 2;
1088:         v3 -= 2;
1089:         v4 -= 2;
1090:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1091:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1092:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1093:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1094:       }
1095:       if (j == 1) {
1096:         tmp0 = tmps[*vi--];
1097:         sum1 -= *v1-- * tmp0;
1098:         sum2 -= *v2-- * tmp0;
1099:         sum3 -= *v3-- * tmp0;
1100:         sum4 -= *v4-- * tmp0;
1101:       }

1103:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1104:       row--;
1105:       sum2 -= *v2-- * tmp0;
1106:       sum3 -= *v3-- * tmp0;
1107:       sum4 -= *v4-- * tmp0;
1108:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1109:       row--;
1110:       sum3 -= *v3-- * tmp0;
1111:       sum4 -= *v4-- * tmp0;
1112:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1113:       row--;
1114:       sum4 -= *v4-- * tmp0;
1115:       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1116:       row--;
1117:       break;
1118:     case 5:
1119:       sum1 = tmp[row];
1120:       sum2 = tmp[row - 1];
1121:       sum3 = tmp[row - 2];
1122:       sum4 = tmp[row - 3];
1123:       sum5 = tmp[row - 4];
1124:       v2   = aa + ai[row] - 1;
1125:       v3   = aa + ai[row - 1] - 1;
1126:       v4   = aa + ai[row - 2] - 1;
1127:       v5   = aa + ai[row - 3] - 1;
1128:       for (j = nz; j > 1; j -= 2) {
1129:         vi -= 2;
1130:         i0   = vi[2];
1131:         i1   = vi[1];
1132:         tmp0 = tmps[i0];
1133:         tmp1 = tmps[i1];
1134:         v1 -= 2;
1135:         v2 -= 2;
1136:         v3 -= 2;
1137:         v4 -= 2;
1138:         v5 -= 2;
1139:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1140:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1141:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1142:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1143:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1144:       }
1145:       if (j == 1) {
1146:         tmp0 = tmps[*vi--];
1147:         sum1 -= *v1-- * tmp0;
1148:         sum2 -= *v2-- * tmp0;
1149:         sum3 -= *v3-- * tmp0;
1150:         sum4 -= *v4-- * tmp0;
1151:         sum5 -= *v5-- * tmp0;
1152:       }

1154:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1155:       row--;
1156:       sum2 -= *v2-- * tmp0;
1157:       sum3 -= *v3-- * tmp0;
1158:       sum4 -= *v4-- * tmp0;
1159:       sum5 -= *v5-- * tmp0;
1160:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1161:       row--;
1162:       sum3 -= *v3-- * tmp0;
1163:       sum4 -= *v4-- * tmp0;
1164:       sum5 -= *v5-- * tmp0;
1165:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1166:       row--;
1167:       sum4 -= *v4-- * tmp0;
1168:       sum5 -= *v5-- * tmp0;
1169:       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1170:       row--;
1171:       sum5 -= *v5-- * tmp0;
1172:       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1173:       row--;
1174:       break;
1175:     default:
1176:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1177:     }
1178:   }
1179:   PetscCall(ISRestoreIndices(isrow, &rout));
1180:   PetscCall(ISRestoreIndices(iscol, &cout));
1181:   PetscCall(VecRestoreArrayRead(bb, &b));
1182:   PetscCall(VecRestoreArrayWrite(xx, &x));
1183:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1184:   PetscFunctionReturn(PETSC_SUCCESS);
1185: }

1187: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1188: {
1189:   Mat              C = B;
1190:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1191:   IS               isrow = b->row, isicol = b->icol;
1192:   const PetscInt  *r, *ic, *ics;
1193:   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1194:   PetscInt         i, j, k, nz, nzL, row, *pj;
1195:   const PetscInt  *ajtmp, *bjtmp;
1196:   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1197:   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1198:   FactorShiftCtx   sctx;
1199:   const PetscInt  *ddiag;
1200:   PetscReal        rs;
1201:   MatScalar        d;
1202:   PetscInt         inod, nodesz, node_max, col;
1203:   const PetscInt  *ns;
1204:   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;

1206:   PetscFunctionBegin;
1207:   /* MatPivotSetUp(): initialize shift context sctx */
1208:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1210:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1211:     ddiag          = a->diag;
1212:     sctx.shift_top = info->zeropivot;
1213:     for (i = 0; i < n; i++) {
1214:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1215:       d  = (aa)[ddiag[i]];
1216:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1217:       v  = aa + ai[i];
1218:       nz = ai[i + 1] - ai[i];
1219:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1220:       if (rs > sctx.shift_top) sctx.shift_top = rs;
1221:     }
1222:     sctx.shift_top *= 1.1;
1223:     sctx.nshift_max = 5;
1224:     sctx.shift_lo   = 0.;
1225:     sctx.shift_hi   = 1.;
1226:   }

1228:   PetscCall(ISGetIndices(isrow, &r));
1229:   PetscCall(ISGetIndices(isicol, &ic));

1231:   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1232:   ics = ic;

1234:   node_max = a->inode.node_count;
1235:   ns       = a->inode.size;
1236:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

1238:   /* If max inode size > 4, split it into two inodes.*/
1239:   /* also map the inode sizes according to the ordering */
1240:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1241:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1242:     if (ns[i] > 4) {
1243:       tmp_vec1[j] = 4;
1244:       ++j;
1245:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1246:     } else {
1247:       tmp_vec1[j] = ns[i];
1248:     }
1249:   }
1250:   /* Use the correct node_max */
1251:   node_max = j;

1253:   /* Now reorder the inode info based on mat re-ordering info */
1254:   /* First create a row -> inode_size_array_index map */
1255:   PetscCall(PetscMalloc1(n + 1, &nsmap));
1256:   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1257:   for (i = 0, row = 0; i < node_max; i++) {
1258:     nodesz = tmp_vec1[i];
1259:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1260:   }
1261:   /* Using nsmap, create a reordered ns structure */
1262:   for (i = 0, j = 0; i < node_max; i++) {
1263:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1264:     tmp_vec2[i] = nodesz;
1265:     j += nodesz;
1266:   }
1267:   PetscCall(PetscFree(nsmap));
1268:   PetscCall(PetscFree(tmp_vec1));

1270:   /* Now use the correct ns */
1271:   ns = tmp_vec2;

1273:   do {
1274:     sctx.newshift = PETSC_FALSE;
1275:     /* Now loop over each block-row, and do the factorization */
1276:     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1277:       nodesz = ns[inod];

1279:       switch (nodesz) {
1280:       case 1:
1281:         /* zero rtmp1 */
1282:         /* L part */
1283:         nz    = bi[i + 1] - bi[i];
1284:         bjtmp = bj + bi[i];
1285:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1287:         /* U part */
1288:         nz    = bdiag[i] - bdiag[i + 1];
1289:         bjtmp = bj + bdiag[i + 1] + 1;
1290:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1292:         /* load in initial (unfactored row) */
1293:         nz    = ai[r[i] + 1] - ai[r[i]];
1294:         ajtmp = aj + ai[r[i]];
1295:         v     = aa + ai[r[i]];
1296:         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1298:         /* ZeropivotApply() */
1299:         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

1301:         /* elimination */
1302:         bjtmp = bj + bi[i];
1303:         row   = *bjtmp++;
1304:         nzL   = bi[i + 1] - bi[i];
1305:         for (k = 0; k < nzL; k++) {
1306:           pc = rtmp1 + row;
1307:           if (*pc != 0.0) {
1308:             pv   = b->a + bdiag[row];
1309:             mul1 = *pc * (*pv);
1310:             *pc  = mul1;
1311:             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1312:             pv   = b->a + bdiag[row + 1] + 1;
1313:             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1314:             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1315:             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1316:           }
1317:           row = *bjtmp++;
1318:         }

1320:         /* finished row so stick it into b->a */
1321:         rs = 0.0;
1322:         /* L part */
1323:         pv = b->a + bi[i];
1324:         pj = b->j + bi[i];
1325:         nz = bi[i + 1] - bi[i];
1326:         for (j = 0; j < nz; j++) {
1327:           pv[j] = rtmp1[pj[j]];
1328:           rs += PetscAbsScalar(pv[j]);
1329:         }

1331:         /* U part */
1332:         pv = b->a + bdiag[i + 1] + 1;
1333:         pj = b->j + bdiag[i + 1] + 1;
1334:         nz = bdiag[i] - bdiag[i + 1] - 1;
1335:         for (j = 0; j < nz; j++) {
1336:           pv[j] = rtmp1[pj[j]];
1337:           rs += PetscAbsScalar(pv[j]);
1338:         }

1340:         /* Check zero pivot */
1341:         sctx.rs = rs;
1342:         sctx.pv = rtmp1[i];
1343:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1344:         if (sctx.newshift) break;

1346:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1347:         pv  = b->a + bdiag[i];
1348:         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1349:         break;

1351:       case 2:
1352:         /* zero rtmp1 and rtmp2 */
1353:         /* L part */
1354:         nz    = bi[i + 1] - bi[i];
1355:         bjtmp = bj + bi[i];
1356:         for (j = 0; j < nz; j++) {
1357:           col        = bjtmp[j];
1358:           rtmp1[col] = 0.0;
1359:           rtmp2[col] = 0.0;
1360:         }

1362:         /* U part */
1363:         nz    = bdiag[i] - bdiag[i + 1];
1364:         bjtmp = bj + bdiag[i + 1] + 1;
1365:         for (j = 0; j < nz; j++) {
1366:           col        = bjtmp[j];
1367:           rtmp1[col] = 0.0;
1368:           rtmp2[col] = 0.0;
1369:         }

1371:         /* load in initial (unfactored row) */
1372:         nz    = ai[r[i] + 1] - ai[r[i]];
1373:         ajtmp = aj + ai[r[i]];
1374:         v1    = aa + ai[r[i]];
1375:         v2    = aa + ai[r[i] + 1];
1376:         for (j = 0; j < nz; j++) {
1377:           col        = ics[ajtmp[j]];
1378:           rtmp1[col] = v1[j];
1379:           rtmp2[col] = v2[j];
1380:         }
1381:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382:         rtmp1[i] += sctx.shift_amount;
1383:         rtmp2[i + 1] += sctx.shift_amount;

1385:         /* elimination */
1386:         bjtmp = bj + bi[i];
1387:         row   = *bjtmp++; /* pivot row */
1388:         nzL   = bi[i + 1] - bi[i];
1389:         for (k = 0; k < nzL; k++) {
1390:           pc1 = rtmp1 + row;
1391:           pc2 = rtmp2 + row;
1392:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393:             pv   = b->a + bdiag[row];
1394:             mul1 = *pc1 * (*pv);
1395:             mul2 = *pc2 * (*pv);
1396:             *pc1 = mul1;
1397:             *pc2 = mul2;

1399:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1400:             pv = b->a + bdiag[row + 1] + 1;
1401:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1402:             for (j = 0; j < nz; j++) {
1403:               col = pj[j];
1404:               rtmp1[col] -= mul1 * pv[j];
1405:               rtmp2[col] -= mul2 * pv[j];
1406:             }
1407:             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1408:           }
1409:           row = *bjtmp++;
1410:         }

1412:         /* finished row i; check zero pivot, then stick row i into b->a */
1413:         rs = 0.0;
1414:         /* L part */
1415:         pc1 = b->a + bi[i];
1416:         pj  = b->j + bi[i];
1417:         nz  = bi[i + 1] - bi[i];
1418:         for (j = 0; j < nz; j++) {
1419:           col    = pj[j];
1420:           pc1[j] = rtmp1[col];
1421:           rs += PetscAbsScalar(pc1[j]);
1422:         }
1423:         /* U part */
1424:         pc1 = b->a + bdiag[i + 1] + 1;
1425:         pj  = b->j + bdiag[i + 1] + 1;
1426:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1427:         for (j = 0; j < nz; j++) {
1428:           col    = pj[j];
1429:           pc1[j] = rtmp1[col];
1430:           rs += PetscAbsScalar(pc1[j]);
1431:         }

1433:         sctx.rs = rs;
1434:         sctx.pv = rtmp1[i];
1435:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1436:         if (sctx.newshift) break;
1437:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1438:         *pc1 = 1.0 / sctx.pv;

1440:         /* Now take care of diagonal 2x2 block. */
1441:         pc2 = rtmp2 + i;
1442:         if (*pc2 != 0.0) {
1443:           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1444:           *pc2 = mul1;                        /* insert L entry */
1445:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1446:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1447:           for (j = 0; j < nz; j++) {
1448:             col = pj[j];
1449:             rtmp2[col] -= mul1 * rtmp1[col];
1450:           }
1451:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1452:         }

1454:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1455:         rs = 0.0;
1456:         /* L part */
1457:         pc2 = b->a + bi[i + 1];
1458:         pj  = b->j + bi[i + 1];
1459:         nz  = bi[i + 2] - bi[i + 1];
1460:         for (j = 0; j < nz; j++) {
1461:           col    = pj[j];
1462:           pc2[j] = rtmp2[col];
1463:           rs += PetscAbsScalar(pc2[j]);
1464:         }
1465:         /* U part */
1466:         pc2 = b->a + bdiag[i + 2] + 1;
1467:         pj  = b->j + bdiag[i + 2] + 1;
1468:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1469:         for (j = 0; j < nz; j++) {
1470:           col    = pj[j];
1471:           pc2[j] = rtmp2[col];
1472:           rs += PetscAbsScalar(pc2[j]);
1473:         }

1475:         sctx.rs = rs;
1476:         sctx.pv = rtmp2[i + 1];
1477:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1478:         if (sctx.newshift) break;
1479:         pc2  = b->a + bdiag[i + 1];
1480:         *pc2 = 1.0 / sctx.pv;
1481:         break;

1483:       case 3:
1484:         /* zero rtmp */
1485:         /* L part */
1486:         nz    = bi[i + 1] - bi[i];
1487:         bjtmp = bj + bi[i];
1488:         for (j = 0; j < nz; j++) {
1489:           col        = bjtmp[j];
1490:           rtmp1[col] = 0.0;
1491:           rtmp2[col] = 0.0;
1492:           rtmp3[col] = 0.0;
1493:         }

1495:         /* U part */
1496:         nz    = bdiag[i] - bdiag[i + 1];
1497:         bjtmp = bj + bdiag[i + 1] + 1;
1498:         for (j = 0; j < nz; j++) {
1499:           col        = bjtmp[j];
1500:           rtmp1[col] = 0.0;
1501:           rtmp2[col] = 0.0;
1502:           rtmp3[col] = 0.0;
1503:         }

1505:         /* load in initial (unfactored row) */
1506:         nz    = ai[r[i] + 1] - ai[r[i]];
1507:         ajtmp = aj + ai[r[i]];
1508:         v1    = aa + ai[r[i]];
1509:         v2    = aa + ai[r[i] + 1];
1510:         v3    = aa + ai[r[i] + 2];
1511:         for (j = 0; j < nz; j++) {
1512:           col        = ics[ajtmp[j]];
1513:           rtmp1[col] = v1[j];
1514:           rtmp2[col] = v2[j];
1515:           rtmp3[col] = v3[j];
1516:         }
1517:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1518:         rtmp1[i] += sctx.shift_amount;
1519:         rtmp2[i + 1] += sctx.shift_amount;
1520:         rtmp3[i + 2] += sctx.shift_amount;

1522:         /* elimination */
1523:         bjtmp = bj + bi[i];
1524:         row   = *bjtmp++; /* pivot row */
1525:         nzL   = bi[i + 1] - bi[i];
1526:         for (k = 0; k < nzL; k++) {
1527:           pc1 = rtmp1 + row;
1528:           pc2 = rtmp2 + row;
1529:           pc3 = rtmp3 + row;
1530:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1531:             pv   = b->a + bdiag[row];
1532:             mul1 = *pc1 * (*pv);
1533:             mul2 = *pc2 * (*pv);
1534:             mul3 = *pc3 * (*pv);
1535:             *pc1 = mul1;
1536:             *pc2 = mul2;
1537:             *pc3 = mul3;

1539:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1540:             pv = b->a + bdiag[row + 1] + 1;
1541:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1542:             for (j = 0; j < nz; j++) {
1543:               col = pj[j];
1544:               rtmp1[col] -= mul1 * pv[j];
1545:               rtmp2[col] -= mul2 * pv[j];
1546:               rtmp3[col] -= mul3 * pv[j];
1547:             }
1548:             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1549:           }
1550:           row = *bjtmp++;
1551:         }

1553:         /* finished row i; check zero pivot, then stick row i into b->a */
1554:         rs = 0.0;
1555:         /* L part */
1556:         pc1 = b->a + bi[i];
1557:         pj  = b->j + bi[i];
1558:         nz  = bi[i + 1] - bi[i];
1559:         for (j = 0; j < nz; j++) {
1560:           col    = pj[j];
1561:           pc1[j] = rtmp1[col];
1562:           rs += PetscAbsScalar(pc1[j]);
1563:         }
1564:         /* U part */
1565:         pc1 = b->a + bdiag[i + 1] + 1;
1566:         pj  = b->j + bdiag[i + 1] + 1;
1567:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1568:         for (j = 0; j < nz; j++) {
1569:           col    = pj[j];
1570:           pc1[j] = rtmp1[col];
1571:           rs += PetscAbsScalar(pc1[j]);
1572:         }

1574:         sctx.rs = rs;
1575:         sctx.pv = rtmp1[i];
1576:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1577:         if (sctx.newshift) break;
1578:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1579:         *pc1 = 1.0 / sctx.pv;

1581:         /* Now take care of 1st column of diagonal 3x3 block. */
1582:         pc2 = rtmp2 + i;
1583:         pc3 = rtmp3 + i;
1584:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1585:           mul2 = (*pc2) * (*pc1);
1586:           *pc2 = mul2;
1587:           mul3 = (*pc3) * (*pc1);
1588:           *pc3 = mul3;
1589:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1590:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1591:           for (j = 0; j < nz; j++) {
1592:             col = pj[j];
1593:             rtmp2[col] -= mul2 * rtmp1[col];
1594:             rtmp3[col] -= mul3 * rtmp1[col];
1595:           }
1596:           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1597:         }

1599:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1600:         rs = 0.0;
1601:         /* L part */
1602:         pc2 = b->a + bi[i + 1];
1603:         pj  = b->j + bi[i + 1];
1604:         nz  = bi[i + 2] - bi[i + 1];
1605:         for (j = 0; j < nz; j++) {
1606:           col    = pj[j];
1607:           pc2[j] = rtmp2[col];
1608:           rs += PetscAbsScalar(pc2[j]);
1609:         }
1610:         /* U part */
1611:         pc2 = b->a + bdiag[i + 2] + 1;
1612:         pj  = b->j + bdiag[i + 2] + 1;
1613:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1614:         for (j = 0; j < nz; j++) {
1615:           col    = pj[j];
1616:           pc2[j] = rtmp2[col];
1617:           rs += PetscAbsScalar(pc2[j]);
1618:         }

1620:         sctx.rs = rs;
1621:         sctx.pv = rtmp2[i + 1];
1622:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1623:         if (sctx.newshift) break;
1624:         pc2  = b->a + bdiag[i + 1];
1625:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1627:         /* Now take care of 2nd column of diagonal 3x3 block. */
1628:         pc3 = rtmp3 + i + 1;
1629:         if (*pc3 != 0.0) {
1630:           mul3 = (*pc3) * (*pc2);
1631:           *pc3 = mul3;
1632:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1633:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1634:           for (j = 0; j < nz; j++) {
1635:             col = pj[j];
1636:             rtmp3[col] -= mul3 * rtmp2[col];
1637:           }
1638:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1639:         }

1641:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1642:         rs = 0.0;
1643:         /* L part */
1644:         pc3 = b->a + bi[i + 2];
1645:         pj  = b->j + bi[i + 2];
1646:         nz  = bi[i + 3] - bi[i + 2];
1647:         for (j = 0; j < nz; j++) {
1648:           col    = pj[j];
1649:           pc3[j] = rtmp3[col];
1650:           rs += PetscAbsScalar(pc3[j]);
1651:         }
1652:         /* U part */
1653:         pc3 = b->a + bdiag[i + 3] + 1;
1654:         pj  = b->j + bdiag[i + 3] + 1;
1655:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1656:         for (j = 0; j < nz; j++) {
1657:           col    = pj[j];
1658:           pc3[j] = rtmp3[col];
1659:           rs += PetscAbsScalar(pc3[j]);
1660:         }

1662:         sctx.rs = rs;
1663:         sctx.pv = rtmp3[i + 2];
1664:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1665:         if (sctx.newshift) break;
1666:         pc3  = b->a + bdiag[i + 2];
1667:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1668:         break;
1669:       case 4:
1670:         /* zero rtmp */
1671:         /* L part */
1672:         nz    = bi[i + 1] - bi[i];
1673:         bjtmp = bj + bi[i];
1674:         for (j = 0; j < nz; j++) {
1675:           col        = bjtmp[j];
1676:           rtmp1[col] = 0.0;
1677:           rtmp2[col] = 0.0;
1678:           rtmp3[col] = 0.0;
1679:           rtmp4[col] = 0.0;
1680:         }

1682:         /* U part */
1683:         nz    = bdiag[i] - bdiag[i + 1];
1684:         bjtmp = bj + bdiag[i + 1] + 1;
1685:         for (j = 0; j < nz; j++) {
1686:           col        = bjtmp[j];
1687:           rtmp1[col] = 0.0;
1688:           rtmp2[col] = 0.0;
1689:           rtmp3[col] = 0.0;
1690:           rtmp4[col] = 0.0;
1691:         }

1693:         /* load in initial (unfactored row) */
1694:         nz    = ai[r[i] + 1] - ai[r[i]];
1695:         ajtmp = aj + ai[r[i]];
1696:         v1    = aa + ai[r[i]];
1697:         v2    = aa + ai[r[i] + 1];
1698:         v3    = aa + ai[r[i] + 2];
1699:         v4    = aa + ai[r[i] + 3];
1700:         for (j = 0; j < nz; j++) {
1701:           col        = ics[ajtmp[j]];
1702:           rtmp1[col] = v1[j];
1703:           rtmp2[col] = v2[j];
1704:           rtmp3[col] = v3[j];
1705:           rtmp4[col] = v4[j];
1706:         }
1707:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1708:         rtmp1[i] += sctx.shift_amount;
1709:         rtmp2[i + 1] += sctx.shift_amount;
1710:         rtmp3[i + 2] += sctx.shift_amount;
1711:         rtmp4[i + 3] += sctx.shift_amount;

1713:         /* elimination */
1714:         bjtmp = bj + bi[i];
1715:         row   = *bjtmp++; /* pivot row */
1716:         nzL   = bi[i + 1] - bi[i];
1717:         for (k = 0; k < nzL; k++) {
1718:           pc1 = rtmp1 + row;
1719:           pc2 = rtmp2 + row;
1720:           pc3 = rtmp3 + row;
1721:           pc4 = rtmp4 + row;
1722:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723:             pv   = b->a + bdiag[row];
1724:             mul1 = *pc1 * (*pv);
1725:             mul2 = *pc2 * (*pv);
1726:             mul3 = *pc3 * (*pv);
1727:             mul4 = *pc4 * (*pv);
1728:             *pc1 = mul1;
1729:             *pc2 = mul2;
1730:             *pc3 = mul3;
1731:             *pc4 = mul4;

1733:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1734:             pv = b->a + bdiag[row + 1] + 1;
1735:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1736:             for (j = 0; j < nz; j++) {
1737:               col = pj[j];
1738:               rtmp1[col] -= mul1 * pv[j];
1739:               rtmp2[col] -= mul2 * pv[j];
1740:               rtmp3[col] -= mul3 * pv[j];
1741:               rtmp4[col] -= mul4 * pv[j];
1742:             }
1743:             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1744:           }
1745:           row = *bjtmp++;
1746:         }

1748:         /* finished row i; check zero pivot, then stick row i into b->a */
1749:         rs = 0.0;
1750:         /* L part */
1751:         pc1 = b->a + bi[i];
1752:         pj  = b->j + bi[i];
1753:         nz  = bi[i + 1] - bi[i];
1754:         for (j = 0; j < nz; j++) {
1755:           col    = pj[j];
1756:           pc1[j] = rtmp1[col];
1757:           rs += PetscAbsScalar(pc1[j]);
1758:         }
1759:         /* U part */
1760:         pc1 = b->a + bdiag[i + 1] + 1;
1761:         pj  = b->j + bdiag[i + 1] + 1;
1762:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1763:         for (j = 0; j < nz; j++) {
1764:           col    = pj[j];
1765:           pc1[j] = rtmp1[col];
1766:           rs += PetscAbsScalar(pc1[j]);
1767:         }

1769:         sctx.rs = rs;
1770:         sctx.pv = rtmp1[i];
1771:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1772:         if (sctx.newshift) break;
1773:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1774:         *pc1 = 1.0 / sctx.pv;

1776:         /* Now take care of 1st column of diagonal 4x4 block. */
1777:         pc2 = rtmp2 + i;
1778:         pc3 = rtmp3 + i;
1779:         pc4 = rtmp4 + i;
1780:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1781:           mul2 = (*pc2) * (*pc1);
1782:           *pc2 = mul2;
1783:           mul3 = (*pc3) * (*pc1);
1784:           *pc3 = mul3;
1785:           mul4 = (*pc4) * (*pc1);
1786:           *pc4 = mul4;
1787:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1788:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1789:           for (j = 0; j < nz; j++) {
1790:             col = pj[j];
1791:             rtmp2[col] -= mul2 * rtmp1[col];
1792:             rtmp3[col] -= mul3 * rtmp1[col];
1793:             rtmp4[col] -= mul4 * rtmp1[col];
1794:           }
1795:           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1796:         }

1798:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1799:         rs = 0.0;
1800:         /* L part */
1801:         pc2 = b->a + bi[i + 1];
1802:         pj  = b->j + bi[i + 1];
1803:         nz  = bi[i + 2] - bi[i + 1];
1804:         for (j = 0; j < nz; j++) {
1805:           col    = pj[j];
1806:           pc2[j] = rtmp2[col];
1807:           rs += PetscAbsScalar(pc2[j]);
1808:         }
1809:         /* U part */
1810:         pc2 = b->a + bdiag[i + 2] + 1;
1811:         pj  = b->j + bdiag[i + 2] + 1;
1812:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1813:         for (j = 0; j < nz; j++) {
1814:           col    = pj[j];
1815:           pc2[j] = rtmp2[col];
1816:           rs += PetscAbsScalar(pc2[j]);
1817:         }

1819:         sctx.rs = rs;
1820:         sctx.pv = rtmp2[i + 1];
1821:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1822:         if (sctx.newshift) break;
1823:         pc2  = b->a + bdiag[i + 1];
1824:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1826:         /* Now take care of 2nd column of diagonal 4x4 block. */
1827:         pc3 = rtmp3 + i + 1;
1828:         pc4 = rtmp4 + i + 1;
1829:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1830:           mul3 = (*pc3) * (*pc2);
1831:           *pc3 = mul3;
1832:           mul4 = (*pc4) * (*pc2);
1833:           *pc4 = mul4;
1834:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1835:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1836:           for (j = 0; j < nz; j++) {
1837:             col = pj[j];
1838:             rtmp3[col] -= mul3 * rtmp2[col];
1839:             rtmp4[col] -= mul4 * rtmp2[col];
1840:           }
1841:           PetscCall(PetscLogFlops(4.0 * nz));
1842:         }

1844:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1845:         rs = 0.0;
1846:         /* L part */
1847:         pc3 = b->a + bi[i + 2];
1848:         pj  = b->j + bi[i + 2];
1849:         nz  = bi[i + 3] - bi[i + 2];
1850:         for (j = 0; j < nz; j++) {
1851:           col    = pj[j];
1852:           pc3[j] = rtmp3[col];
1853:           rs += PetscAbsScalar(pc3[j]);
1854:         }
1855:         /* U part */
1856:         pc3 = b->a + bdiag[i + 3] + 1;
1857:         pj  = b->j + bdiag[i + 3] + 1;
1858:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1859:         for (j = 0; j < nz; j++) {
1860:           col    = pj[j];
1861:           pc3[j] = rtmp3[col];
1862:           rs += PetscAbsScalar(pc3[j]);
1863:         }

1865:         sctx.rs = rs;
1866:         sctx.pv = rtmp3[i + 2];
1867:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1868:         if (sctx.newshift) break;
1869:         pc3  = b->a + bdiag[i + 2];
1870:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */

1872:         /* Now take care of 3rd column of diagonal 4x4 block. */
1873:         pc4 = rtmp4 + i + 2;
1874:         if (*pc4 != 0.0) {
1875:           mul4 = (*pc4) * (*pc3);
1876:           *pc4 = mul4;
1877:           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1878:           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1879:           for (j = 0; j < nz; j++) {
1880:             col = pj[j];
1881:             rtmp4[col] -= mul4 * rtmp3[col];
1882:           }
1883:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1884:         }

1886:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1887:         rs = 0.0;
1888:         /* L part */
1889:         pc4 = b->a + bi[i + 3];
1890:         pj  = b->j + bi[i + 3];
1891:         nz  = bi[i + 4] - bi[i + 3];
1892:         for (j = 0; j < nz; j++) {
1893:           col    = pj[j];
1894:           pc4[j] = rtmp4[col];
1895:           rs += PetscAbsScalar(pc4[j]);
1896:         }
1897:         /* U part */
1898:         pc4 = b->a + bdiag[i + 4] + 1;
1899:         pj  = b->j + bdiag[i + 4] + 1;
1900:         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1901:         for (j = 0; j < nz; j++) {
1902:           col    = pj[j];
1903:           pc4[j] = rtmp4[col];
1904:           rs += PetscAbsScalar(pc4[j]);
1905:         }

1907:         sctx.rs = rs;
1908:         sctx.pv = rtmp4[i + 3];
1909:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1910:         if (sctx.newshift) break;
1911:         pc4  = b->a + bdiag[i + 3];
1912:         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1913:         break;

1915:       default:
1916:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1917:       }
1918:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1919:       i += nodesz;              /* Update the row */
1920:     }

1922:     /* MatPivotRefine() */
1923:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1924:       /*
1925:        * if no shift in this attempt & shifting & started shifting & can refine,
1926:        * then try lower shift
1927:        */
1928:       sctx.shift_hi       = sctx.shift_fraction;
1929:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1930:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1931:       sctx.newshift       = PETSC_TRUE;
1932:       sctx.nshift++;
1933:     }
1934:   } while (sctx.newshift);

1936:   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1937:   PetscCall(PetscFree(tmp_vec2));
1938:   PetscCall(ISRestoreIndices(isicol, &ic));
1939:   PetscCall(ISRestoreIndices(isrow, &r));

1941:   if (b->inode.size) {
1942:     C->ops->solve = MatSolve_SeqAIJ_Inode;
1943:   } else {
1944:     C->ops->solve = MatSolve_SeqAIJ;
1945:   }
1946:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1947:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1948:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1949:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1950:   C->assembled              = PETSC_TRUE;
1951:   C->preallocated           = PETSC_TRUE;

1953:   PetscCall(PetscLogFlops(C->cmap->n));

1955:   /* MatShiftView(A,info,&sctx) */
1956:   if (sctx.nshift) {
1957:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1958:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1959:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1960:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1961:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1962:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1963:     }
1964:   }
1965:   PetscFunctionReturn(PETSC_SUCCESS);
1966: }

1968: #if 0
1969: // unused
1970: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1971: {
1972:   Mat              C = B;
1973:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1974:   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1975:   const PetscInt  *r, *ic, *c, *ics;
1976:   PetscInt         n = A->rmap->n, *bi = b->i;
1977:   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1978:   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1979:   PetscInt        *ai = a->i, *aj = a->j;
1980:   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1981:   PetscScalar      mul1, mul2, mul3, tmp;
1982:   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1983:   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1984:   PetscReal        rs = 0.0;
1985:   FactorShiftCtx   sctx;

1987:   PetscFunctionBegin;
1988:   sctx.shift_top      = 0;
1989:   sctx.nshift_max     = 0;
1990:   sctx.shift_lo       = 0;
1991:   sctx.shift_hi       = 0;
1992:   sctx.shift_fraction = 0;

1994:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1995:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1996:     sctx.shift_top = 0;
1997:     for (i = 0; i < n; i++) {
1998:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1999:       rs    = 0.0;
2000:       ajtmp = aj + ai[i];
2001:       rtmp1 = aa + ai[i];
2002:       nz    = ai[i + 1] - ai[i];
2003:       for (j = 0; j < nz; j++) {
2004:         if (*ajtmp != i) {
2005:           rs += PetscAbsScalar(*rtmp1++);
2006:         } else {
2007:           rs -= PetscRealPart(*rtmp1++);
2008:         }
2009:         ajtmp++;
2010:       }
2011:       if (rs > sctx.shift_top) sctx.shift_top = rs;
2012:     }
2013:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2014:     sctx.shift_top *= 1.1;
2015:     sctx.nshift_max = 5;
2016:     sctx.shift_lo   = 0.;
2017:     sctx.shift_hi   = 1.;
2018:   }
2019:   sctx.shift_amount = 0;
2020:   sctx.nshift       = 0;

2022:   PetscCall(ISGetIndices(isrow, &r));
2023:   PetscCall(ISGetIndices(iscol, &c));
2024:   PetscCall(ISGetIndices(isicol, &ic));
2025:   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2026:   ics = ic;

2028:   node_max = a->inode.node_count;
2029:   ns       = a->inode.size;
2030:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

2032:   /* If max inode size > 3, split it into two inodes.*/
2033:   /* also map the inode sizes according to the ordering */
2034:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2035:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2036:     if (ns[i] > 3) {
2037:       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2038:       ++j;
2039:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2040:     } else {
2041:       tmp_vec1[j] = ns[i];
2042:     }
2043:   }
2044:   /* Use the correct node_max */
2045:   node_max = j;

2047:   /* Now reorder the inode info based on mat re-ordering info */
2048:   /* First create a row -> inode_size_array_index map */
2049:   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2050:   for (i = 0, row = 0; i < node_max; i++) {
2051:     nodesz = tmp_vec1[i];
2052:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2053:   }
2054:   /* Using nsmap, create a reordered ns structure */
2055:   for (i = 0, j = 0; i < node_max; i++) {
2056:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2057:     tmp_vec2[i] = nodesz;
2058:     j += nodesz;
2059:   }
2060:   PetscCall(PetscFree2(nsmap, tmp_vec1));
2061:   /* Now use the correct ns */
2062:   ns = tmp_vec2;

2064:   do {
2065:     sctx.newshift = PETSC_FALSE;
2066:     /* Now loop over each block-row, and do the factorization */
2067:     for (i = 0, row = 0; i < node_max; i++) {
2068:       nodesz = ns[i];
2069:       nz     = bi[row + 1] - bi[row];
2070:       bjtmp  = bj + bi[row];

2072:       switch (nodesz) {
2073:       case 1:
2074:         for (j = 0; j < nz; j++) {
2075:           idx         = bjtmp[j];
2076:           rtmp11[idx] = 0.0;
2077:         }

2079:         /* load in initial (unfactored row) */
2080:         idx    = r[row];
2081:         nz_tmp = ai[idx + 1] - ai[idx];
2082:         ajtmp  = aj + ai[idx];
2083:         v1     = aa + ai[idx];

2085:         for (j = 0; j < nz_tmp; j++) {
2086:           idx         = ics[ajtmp[j]];
2087:           rtmp11[idx] = v1[j];
2088:         }
2089:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2091:         prow = *bjtmp++;
2092:         while (prow < row) {
2093:           pc1 = rtmp11 + prow;
2094:           if (*pc1 != 0.0) {
2095:             pv     = ba + bd[prow];
2096:             pj     = nbj + bd[prow];
2097:             mul1   = *pc1 * *pv++;
2098:             *pc1   = mul1;
2099:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2100:             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2101:             for (j = 0; j < nz_tmp; j++) {
2102:               tmp = pv[j];
2103:               idx = pj[j];
2104:               rtmp11[idx] -= mul1 * tmp;
2105:             }
2106:           }
2107:           prow = *bjtmp++;
2108:         }
2109:         pj  = bj + bi[row];
2110:         pc1 = ba + bi[row];

2112:         sctx.pv     = rtmp11[row];
2113:         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2114:         rs          = 0.0;
2115:         for (j = 0; j < nz; j++) {
2116:           idx    = pj[j];
2117:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2118:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2119:         }
2120:         sctx.rs = rs;
2121:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2122:         if (sctx.newshift) goto endofwhile;
2123:         break;

2125:       case 2:
2126:         for (j = 0; j < nz; j++) {
2127:           idx         = bjtmp[j];
2128:           rtmp11[idx] = 0.0;
2129:           rtmp22[idx] = 0.0;
2130:         }

2132:         /* load in initial (unfactored row) */
2133:         idx    = r[row];
2134:         nz_tmp = ai[idx + 1] - ai[idx];
2135:         ajtmp  = aj + ai[idx];
2136:         v1     = aa + ai[idx];
2137:         v2     = aa + ai[idx + 1];
2138:         for (j = 0; j < nz_tmp; j++) {
2139:           idx         = ics[ajtmp[j]];
2140:           rtmp11[idx] = v1[j];
2141:           rtmp22[idx] = v2[j];
2142:         }
2143:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2144:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;

2146:         prow = *bjtmp++;
2147:         while (prow < row) {
2148:           pc1 = rtmp11 + prow;
2149:           pc2 = rtmp22 + prow;
2150:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2151:             pv   = ba + bd[prow];
2152:             pj   = nbj + bd[prow];
2153:             mul1 = *pc1 * *pv;
2154:             mul2 = *pc2 * *pv;
2155:             ++pv;
2156:             *pc1 = mul1;
2157:             *pc2 = mul2;

2159:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2160:             for (j = 0; j < nz_tmp; j++) {
2161:               tmp = pv[j];
2162:               idx = pj[j];
2163:               rtmp11[idx] -= mul1 * tmp;
2164:               rtmp22[idx] -= mul2 * tmp;
2165:             }
2166:             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2167:           }
2168:           prow = *bjtmp++;
2169:         }

2171:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2172:         pc1 = rtmp11 + prow;
2173:         pc2 = rtmp22 + prow;

2175:         sctx.pv = *pc1;
2176:         pj      = bj + bi[prow];
2177:         rs      = 0.0;
2178:         for (j = 0; j < nz; j++) {
2179:           idx = pj[j];
2180:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2181:         }
2182:         sctx.rs = rs;
2183:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2184:         if (sctx.newshift) goto endofwhile;

2186:         if (*pc2 != 0.0) {
2187:           pj     = nbj + bd[prow];
2188:           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2189:           *pc2   = mul2;
2190:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2191:           for (j = 0; j < nz_tmp; j++) {
2192:             idx = pj[j];
2193:             tmp = rtmp11[idx];
2194:             rtmp22[idx] -= mul2 * tmp;
2195:           }
2196:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2197:         }

2199:         pj  = bj + bi[row];
2200:         pc1 = ba + bi[row];
2201:         pc2 = ba + bi[row + 1];

2203:         sctx.pv         = rtmp22[row + 1];
2204:         rs              = 0.0;
2205:         rtmp11[row]     = 1.0 / rtmp11[row];
2206:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2207:         /* copy row entries from dense representation to sparse */
2208:         for (j = 0; j < nz; j++) {
2209:           idx    = pj[j];
2210:           pc1[j] = rtmp11[idx];
2211:           pc2[j] = rtmp22[idx];
2212:           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2213:         }
2214:         sctx.rs = rs;
2215:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2216:         if (sctx.newshift) goto endofwhile;
2217:         break;

2219:       case 3:
2220:         for (j = 0; j < nz; j++) {
2221:           idx         = bjtmp[j];
2222:           rtmp11[idx] = 0.0;
2223:           rtmp22[idx] = 0.0;
2224:           rtmp33[idx] = 0.0;
2225:         }
2226:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2227:         idx    = r[row];
2228:         nz_tmp = ai[idx + 1] - ai[idx];
2229:         ajtmp  = aj + ai[idx];
2230:         v1     = aa + ai[idx];
2231:         v2     = aa + ai[idx + 1];
2232:         v3     = aa + ai[idx + 2];
2233:         for (j = 0; j < nz_tmp; j++) {
2234:           idx         = ics[ajtmp[j]];
2235:           rtmp11[idx] = v1[j];
2236:           rtmp22[idx] = v2[j];
2237:           rtmp33[idx] = v3[j];
2238:         }
2239:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2240:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2241:         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;

2243:         /* loop over all pivot row blocks above this row block */
2244:         prow = *bjtmp++;
2245:         while (prow < row) {
2246:           pc1 = rtmp11 + prow;
2247:           pc2 = rtmp22 + prow;
2248:           pc3 = rtmp33 + prow;
2249:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2250:             pv   = ba + bd[prow];
2251:             pj   = nbj + bd[prow];
2252:             mul1 = *pc1 * *pv;
2253:             mul2 = *pc2 * *pv;
2254:             mul3 = *pc3 * *pv;
2255:             ++pv;
2256:             *pc1 = mul1;
2257:             *pc2 = mul2;
2258:             *pc3 = mul3;

2260:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2261:             /* update this row based on pivot row */
2262:             for (j = 0; j < nz_tmp; j++) {
2263:               tmp = pv[j];
2264:               idx = pj[j];
2265:               rtmp11[idx] -= mul1 * tmp;
2266:               rtmp22[idx] -= mul2 * tmp;
2267:               rtmp33[idx] -= mul3 * tmp;
2268:             }
2269:             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2270:           }
2271:           prow = *bjtmp++;
2272:         }

2274:         /* Now take care of diagonal 3x3 block in this set of rows */
2275:         /* note: prow = row here */
2276:         pc1 = rtmp11 + prow;
2277:         pc2 = rtmp22 + prow;
2278:         pc3 = rtmp33 + prow;

2280:         sctx.pv = *pc1;
2281:         pj      = bj + bi[prow];
2282:         rs      = 0.0;
2283:         for (j = 0; j < nz; j++) {
2284:           idx = pj[j];
2285:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2286:         }
2287:         sctx.rs = rs;
2288:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2289:         if (sctx.newshift) goto endofwhile;

2291:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2292:           mul2   = (*pc2) / (*pc1);
2293:           mul3   = (*pc3) / (*pc1);
2294:           *pc2   = mul2;
2295:           *pc3   = mul3;
2296:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2297:           pj     = nbj + bd[prow];
2298:           for (j = 0; j < nz_tmp; j++) {
2299:             idx = pj[j];
2300:             tmp = rtmp11[idx];
2301:             rtmp22[idx] -= mul2 * tmp;
2302:             rtmp33[idx] -= mul3 * tmp;
2303:           }
2304:           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2305:         }
2306:         ++prow;

2308:         pc2     = rtmp22 + prow;
2309:         pc3     = rtmp33 + prow;
2310:         sctx.pv = *pc2;
2311:         pj      = bj + bi[prow];
2312:         rs      = 0.0;
2313:         for (j = 0; j < nz; j++) {
2314:           idx = pj[j];
2315:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2316:         }
2317:         sctx.rs = rs;
2318:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2319:         if (sctx.newshift) goto endofwhile;

2321:         if (*pc3 != 0.0) {
2322:           mul3   = (*pc3) / (*pc2);
2323:           *pc3   = mul3;
2324:           pj     = nbj + bd[prow];
2325:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2326:           for (j = 0; j < nz_tmp; j++) {
2327:             idx = pj[j];
2328:             tmp = rtmp22[idx];
2329:             rtmp33[idx] -= mul3 * tmp;
2330:           }
2331:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2332:         }

2334:         pj  = bj + bi[row];
2335:         pc1 = ba + bi[row];
2336:         pc2 = ba + bi[row + 1];
2337:         pc3 = ba + bi[row + 2];

2339:         sctx.pv         = rtmp33[row + 2];
2340:         rs              = 0.0;
2341:         rtmp11[row]     = 1.0 / rtmp11[row];
2342:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2343:         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2344:         /* copy row entries from dense representation to sparse */
2345:         for (j = 0; j < nz; j++) {
2346:           idx    = pj[j];
2347:           pc1[j] = rtmp11[idx];
2348:           pc2[j] = rtmp22[idx];
2349:           pc3[j] = rtmp33[idx];
2350:           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2351:         }

2353:         sctx.rs = rs;
2354:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2355:         if (sctx.newshift) goto endofwhile;
2356:         break;

2358:       default:
2359:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2360:       }
2361:       row += nodesz; /* Update the row */
2362:     }
2363:   endofwhile:;
2364:   } while (sctx.newshift);
2365:   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2366:   PetscCall(PetscFree(tmp_vec2));
2367:   PetscCall(ISRestoreIndices(isicol, &ic));
2368:   PetscCall(ISRestoreIndices(isrow, &r));
2369:   PetscCall(ISRestoreIndices(iscol, &c));

2371:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2372:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2373:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2374:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2375:   C->assembled              = PETSC_TRUE;
2376:   C->preallocated           = PETSC_TRUE;
2377:   if (sctx.nshift) {
2378:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2379:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2380:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2381:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2382:     }
2383:   }
2384:   PetscCall(PetscLogFlops(C->cmap->n));
2385:   PetscCall(MatSeqAIJCheckInode(C));
2386:   PetscFunctionReturn(PETSC_SUCCESS);
2387: }
2388: #endif

2390: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2391: {
2392:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2393:   IS                 iscol = a->col, isrow = a->row;
2394:   const PetscInt    *r, *c, *rout, *cout;
2395:   PetscInt           i, j, n = A->rmap->n;
2396:   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2397:   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2398:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2399:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2400:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2401:   const PetscScalar *b;

2403:   PetscFunctionBegin;
2404:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2405:   node_max = a->inode.node_count;
2406:   ns       = a->inode.size; /* Node Size array */

2408:   PetscCall(VecGetArrayRead(bb, &b));
2409:   PetscCall(VecGetArrayWrite(xx, &x));
2410:   tmp = a->solve_work;

2412:   PetscCall(ISGetIndices(isrow, &rout));
2413:   r = rout;
2414:   PetscCall(ISGetIndices(iscol, &cout));
2415:   c = cout;

2417:   /* forward solve the lower triangular */
2418:   tmps = tmp;
2419:   aa   = a_a;
2420:   aj   = a_j;
2421:   ad   = a->diag;

2423:   for (i = 0, row = 0; i < node_max; ++i) {
2424:     nsz = ns[i];
2425:     aii = ai[row];
2426:     v1  = aa + aii;
2427:     vi  = aj + aii;
2428:     nz  = ai[row + 1] - ai[row];

2430:     if (i < node_max - 1) {
2431:       /* Prefetch the indices for the next block */
2432:       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2433:       /* Prefetch the data for the next block */
2434:       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2435:     }

2437:     switch (nsz) { /* Each loop in 'case' is unrolled */
2438:     case 1:
2439:       sum1 = b[r[row]];
2440:       for (j = 0; j < nz - 1; j += 2) {
2441:         i0   = vi[j];
2442:         i1   = vi[j + 1];
2443:         tmp0 = tmps[i0];
2444:         tmp1 = tmps[i1];
2445:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2446:       }
2447:       if (j == nz - 1) {
2448:         tmp0 = tmps[vi[j]];
2449:         sum1 -= v1[j] * tmp0;
2450:       }
2451:       tmp[row++] = sum1;
2452:       break;
2453:     case 2:
2454:       sum1 = b[r[row]];
2455:       sum2 = b[r[row + 1]];
2456:       v2   = aa + ai[row + 1];

2458:       for (j = 0; j < nz - 1; j += 2) {
2459:         i0   = vi[j];
2460:         i1   = vi[j + 1];
2461:         tmp0 = tmps[i0];
2462:         tmp1 = tmps[i1];
2463:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2464:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2465:       }
2466:       if (j == nz - 1) {
2467:         tmp0 = tmps[vi[j]];
2468:         sum1 -= v1[j] * tmp0;
2469:         sum2 -= v2[j] * tmp0;
2470:       }
2471:       sum2 -= v2[nz] * sum1;
2472:       tmp[row++] = sum1;
2473:       tmp[row++] = sum2;
2474:       break;
2475:     case 3:
2476:       sum1 = b[r[row]];
2477:       sum2 = b[r[row + 1]];
2478:       sum3 = b[r[row + 2]];
2479:       v2   = aa + ai[row + 1];
2480:       v3   = aa + ai[row + 2];

2482:       for (j = 0; j < nz - 1; j += 2) {
2483:         i0   = vi[j];
2484:         i1   = vi[j + 1];
2485:         tmp0 = tmps[i0];
2486:         tmp1 = tmps[i1];
2487:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2488:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2489:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2490:       }
2491:       if (j == nz - 1) {
2492:         tmp0 = tmps[vi[j]];
2493:         sum1 -= v1[j] * tmp0;
2494:         sum2 -= v2[j] * tmp0;
2495:         sum3 -= v3[j] * tmp0;
2496:       }
2497:       sum2 -= v2[nz] * sum1;
2498:       sum3 -= v3[nz] * sum1;
2499:       sum3 -= v3[nz + 1] * sum2;
2500:       tmp[row++] = sum1;
2501:       tmp[row++] = sum2;
2502:       tmp[row++] = sum3;
2503:       break;

2505:     case 4:
2506:       sum1 = b[r[row]];
2507:       sum2 = b[r[row + 1]];
2508:       sum3 = b[r[row + 2]];
2509:       sum4 = b[r[row + 3]];
2510:       v2   = aa + ai[row + 1];
2511:       v3   = aa + ai[row + 2];
2512:       v4   = aa + ai[row + 3];

2514:       for (j = 0; j < nz - 1; j += 2) {
2515:         i0   = vi[j];
2516:         i1   = vi[j + 1];
2517:         tmp0 = tmps[i0];
2518:         tmp1 = tmps[i1];
2519:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2520:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2521:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2522:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2523:       }
2524:       if (j == nz - 1) {
2525:         tmp0 = tmps[vi[j]];
2526:         sum1 -= v1[j] * tmp0;
2527:         sum2 -= v2[j] * tmp0;
2528:         sum3 -= v3[j] * tmp0;
2529:         sum4 -= v4[j] * tmp0;
2530:       }
2531:       sum2 -= v2[nz] * sum1;
2532:       sum3 -= v3[nz] * sum1;
2533:       sum4 -= v4[nz] * sum1;
2534:       sum3 -= v3[nz + 1] * sum2;
2535:       sum4 -= v4[nz + 1] * sum2;
2536:       sum4 -= v4[nz + 2] * sum3;

2538:       tmp[row++] = sum1;
2539:       tmp[row++] = sum2;
2540:       tmp[row++] = sum3;
2541:       tmp[row++] = sum4;
2542:       break;
2543:     case 5:
2544:       sum1 = b[r[row]];
2545:       sum2 = b[r[row + 1]];
2546:       sum3 = b[r[row + 2]];
2547:       sum4 = b[r[row + 3]];
2548:       sum5 = b[r[row + 4]];
2549:       v2   = aa + ai[row + 1];
2550:       v3   = aa + ai[row + 2];
2551:       v4   = aa + ai[row + 3];
2552:       v5   = aa + ai[row + 4];

2554:       for (j = 0; j < nz - 1; j += 2) {
2555:         i0   = vi[j];
2556:         i1   = vi[j + 1];
2557:         tmp0 = tmps[i0];
2558:         tmp1 = tmps[i1];
2559:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2560:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2561:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2562:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2563:         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2564:       }
2565:       if (j == nz - 1) {
2566:         tmp0 = tmps[vi[j]];
2567:         sum1 -= v1[j] * tmp0;
2568:         sum2 -= v2[j] * tmp0;
2569:         sum3 -= v3[j] * tmp0;
2570:         sum4 -= v4[j] * tmp0;
2571:         sum5 -= v5[j] * tmp0;
2572:       }

2574:       sum2 -= v2[nz] * sum1;
2575:       sum3 -= v3[nz] * sum1;
2576:       sum4 -= v4[nz] * sum1;
2577:       sum5 -= v5[nz] * sum1;
2578:       sum3 -= v3[nz + 1] * sum2;
2579:       sum4 -= v4[nz + 1] * sum2;
2580:       sum5 -= v5[nz + 1] * sum2;
2581:       sum4 -= v4[nz + 2] * sum3;
2582:       sum5 -= v5[nz + 2] * sum3;
2583:       sum5 -= v5[nz + 3] * sum4;

2585:       tmp[row++] = sum1;
2586:       tmp[row++] = sum2;
2587:       tmp[row++] = sum3;
2588:       tmp[row++] = sum4;
2589:       tmp[row++] = sum5;
2590:       break;
2591:     default:
2592:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2593:     }
2594:   }
2595:   /* backward solve the upper triangular */
2596:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2597:     nsz = ns[i];
2598:     aii = ad[row + 1] + 1;
2599:     v1  = aa + aii;
2600:     vi  = aj + aii;
2601:     nz  = ad[row] - ad[row + 1] - 1;

2603:     if (i > 0) {
2604:       /* Prefetch the indices for the next block */
2605:       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2606:       /* Prefetch the data for the next block */
2607:       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2608:     }

2610:     switch (nsz) { /* Each loop in 'case' is unrolled */
2611:     case 1:
2612:       sum1 = tmp[row];

2614:       for (j = 0; j < nz - 1; j += 2) {
2615:         i0   = vi[j];
2616:         i1   = vi[j + 1];
2617:         tmp0 = tmps[i0];
2618:         tmp1 = tmps[i1];
2619:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2620:       }
2621:       if (j == nz - 1) {
2622:         tmp0 = tmps[vi[j]];
2623:         sum1 -= v1[j] * tmp0;
2624:       }
2625:       x[c[row]] = tmp[row] = sum1 * v1[nz];
2626:       row--;
2627:       break;
2628:     case 2:
2629:       sum1 = tmp[row];
2630:       sum2 = tmp[row - 1];
2631:       v2   = aa + ad[row] + 1;
2632:       for (j = 0; j < nz - 1; j += 2) {
2633:         i0   = vi[j];
2634:         i1   = vi[j + 1];
2635:         tmp0 = tmps[i0];
2636:         tmp1 = tmps[i1];
2637:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2638:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2639:       }
2640:       if (j == nz - 1) {
2641:         tmp0 = tmps[vi[j]];
2642:         sum1 -= v1[j] * tmp0;
2643:         sum2 -= v2[j + 1] * tmp0;
2644:       }

2646:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2647:       row--;
2648:       sum2 -= v2[0] * tmp0;
2649:       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2650:       row--;
2651:       break;
2652:     case 3:
2653:       sum1 = tmp[row];
2654:       sum2 = tmp[row - 1];
2655:       sum3 = tmp[row - 2];
2656:       v2   = aa + ad[row] + 1;
2657:       v3   = aa + ad[row - 1] + 1;
2658:       for (j = 0; j < nz - 1; j += 2) {
2659:         i0   = vi[j];
2660:         i1   = vi[j + 1];
2661:         tmp0 = tmps[i0];
2662:         tmp1 = tmps[i1];
2663:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2664:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2665:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2666:       }
2667:       if (j == nz - 1) {
2668:         tmp0 = tmps[vi[j]];
2669:         sum1 -= v1[j] * tmp0;
2670:         sum2 -= v2[j + 1] * tmp0;
2671:         sum3 -= v3[j + 2] * tmp0;
2672:       }
2673:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2674:       row--;
2675:       sum2 -= v2[0] * tmp0;
2676:       sum3 -= v3[1] * tmp0;
2677:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2678:       row--;
2679:       sum3 -= v3[0] * tmp0;
2680:       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2681:       row--;

2683:       break;
2684:     case 4:
2685:       sum1 = tmp[row];
2686:       sum2 = tmp[row - 1];
2687:       sum3 = tmp[row - 2];
2688:       sum4 = tmp[row - 3];
2689:       v2   = aa + ad[row] + 1;
2690:       v3   = aa + ad[row - 1] + 1;
2691:       v4   = aa + ad[row - 2] + 1;

2693:       for (j = 0; j < nz - 1; j += 2) {
2694:         i0   = vi[j];
2695:         i1   = vi[j + 1];
2696:         tmp0 = tmps[i0];
2697:         tmp1 = tmps[i1];
2698:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2699:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2700:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2701:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2702:       }
2703:       if (j == nz - 1) {
2704:         tmp0 = tmps[vi[j]];
2705:         sum1 -= v1[j] * tmp0;
2706:         sum2 -= v2[j + 1] * tmp0;
2707:         sum3 -= v3[j + 2] * tmp0;
2708:         sum4 -= v4[j + 3] * tmp0;
2709:       }

2711:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2712:       row--;
2713:       sum2 -= v2[0] * tmp0;
2714:       sum3 -= v3[1] * tmp0;
2715:       sum4 -= v4[2] * tmp0;
2716:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2717:       row--;
2718:       sum3 -= v3[0] * tmp0;
2719:       sum4 -= v4[1] * tmp0;
2720:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2721:       row--;
2722:       sum4 -= v4[0] * tmp0;
2723:       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2724:       row--;
2725:       break;
2726:     case 5:
2727:       sum1 = tmp[row];
2728:       sum2 = tmp[row - 1];
2729:       sum3 = tmp[row - 2];
2730:       sum4 = tmp[row - 3];
2731:       sum5 = tmp[row - 4];
2732:       v2   = aa + ad[row] + 1;
2733:       v3   = aa + ad[row - 1] + 1;
2734:       v4   = aa + ad[row - 2] + 1;
2735:       v5   = aa + ad[row - 3] + 1;
2736:       for (j = 0; j < nz - 1; j += 2) {
2737:         i0   = vi[j];
2738:         i1   = vi[j + 1];
2739:         tmp0 = tmps[i0];
2740:         tmp1 = tmps[i1];
2741:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2742:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2743:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2744:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2745:         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2746:       }
2747:       if (j == nz - 1) {
2748:         tmp0 = tmps[vi[j]];
2749:         sum1 -= v1[j] * tmp0;
2750:         sum2 -= v2[j + 1] * tmp0;
2751:         sum3 -= v3[j + 2] * tmp0;
2752:         sum4 -= v4[j + 3] * tmp0;
2753:         sum5 -= v5[j + 4] * tmp0;
2754:       }

2756:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2757:       row--;
2758:       sum2 -= v2[0] * tmp0;
2759:       sum3 -= v3[1] * tmp0;
2760:       sum4 -= v4[2] * tmp0;
2761:       sum5 -= v5[3] * tmp0;
2762:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2763:       row--;
2764:       sum3 -= v3[0] * tmp0;
2765:       sum4 -= v4[1] * tmp0;
2766:       sum5 -= v5[2] * tmp0;
2767:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2768:       row--;
2769:       sum4 -= v4[0] * tmp0;
2770:       sum5 -= v5[1] * tmp0;
2771:       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2772:       row--;
2773:       sum5 -= v5[0] * tmp0;
2774:       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2775:       row--;
2776:       break;
2777:     default:
2778:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2779:     }
2780:   }
2781:   PetscCall(ISRestoreIndices(isrow, &rout));
2782:   PetscCall(ISRestoreIndices(iscol, &cout));
2783:   PetscCall(VecRestoreArrayRead(bb, &b));
2784:   PetscCall(VecRestoreArrayWrite(xx, &x));
2785:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2786:   PetscFunctionReturn(PETSC_SUCCESS);
2787: }

2789: /*
2790:      Makes a longer coloring[] array and calls the usual code with that
2791: */
2792: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2793: {
2794:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2795:   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2796:   PetscInt        *colorused, i;
2797:   ISColoringValue *newcolor;

2799:   PetscFunctionBegin;
2800:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2801:   PetscCall(PetscMalloc1(n + 1, &newcolor));
2802:   /* loop over inodes, marking a color for each column*/
2803:   row = 0;
2804:   for (i = 0; i < m; i++) {
2805:     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2806:   }

2808:   /* eliminate unneeded colors */
2809:   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2810:   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;

2812:   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2813:   ncolors = colorused[5 * ncolors - 1];
2814:   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2815:   PetscCall(PetscFree(colorused));
2816:   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2817:   PetscCall(PetscFree(coloring));
2818:   PetscFunctionReturn(PETSC_SUCCESS);
2819: }

2821: #include <petsc/private/kernels/blockinvert.h>

2823: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2824: {
2825:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2826:   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2827:   MatScalar         *ibdiag, *bdiag, work[25], *t;
2828:   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2829:   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2830:   const PetscScalar *xb, *b;
2831:   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2832:   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2833:   PetscInt           sz, k, ipvt[5];
2834:   PetscBool          allowzeropivot, zeropivotdetected;
2835:   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;

2837:   PetscFunctionBegin;
2838:   allowzeropivot = PetscNot(A->erroriffailure);
2839:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2840:   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2841:   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");

2843:   if (!a->inode.ibdiagvalid) {
2844:     if (!a->inode.ibdiag) {
2845:       /* calculate space needed for diagonal blocks */
2846:       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2847:       a->inode.bdiagsize = cnt;

2849:       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2850:     }

2852:     /* copy over the diagonal blocks and invert them */
2853:     ibdiag = a->inode.ibdiag;
2854:     bdiag  = a->inode.bdiag;
2855:     cnt    = 0;
2856:     for (i = 0, row = 0; i < m; i++) {
2857:       for (j = 0; j < sizes[i]; j++) {
2858:         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2859:       }
2860:       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));

2862:       switch (sizes[i]) {
2863:       case 1:
2864:         /* Create matrix data structure */
2865:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2866:           if (allowzeropivot) {
2867:             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2868:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2869:             A->factorerror_zeropivot_row   = row;
2870:             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2871:           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2872:         }
2873:         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2874:         break;
2875:       case 2:
2876:         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2877:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2878:         break;
2879:       case 3:
2880:         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2881:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2882:         break;
2883:       case 4:
2884:         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2885:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2886:         break;
2887:       case 5:
2888:         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2889:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2890:         break;
2891:       default:
2892:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2893:       }
2894:       cnt += sizes[i] * sizes[i];
2895:       row += sizes[i];
2896:     }
2897:     a->inode.ibdiagvalid = PETSC_TRUE;
2898:   }
2899:   ibdiag = a->inode.ibdiag;
2900:   bdiag  = a->inode.bdiag;
2901:   t      = a->inode.ssor_work;

2903:   PetscCall(VecGetArray(xx, &x));
2904:   PetscCall(VecGetArrayRead(bb, &b));
2905:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2906:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2907:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2908:       for (i = 0, row = 0; i < m; i++) {
2909:         sz  = diag[row] - ii[row];
2910:         v1  = a->a + ii[row];
2911:         idx = a->j + ii[row];

2913:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2914:         switch (sizes[i]) {
2915:         case 1:

2917:           sum1 = b[row];
2918:           for (n = 0; n < sz - 1; n += 2) {
2919:             i1 = idx[0];
2920:             i2 = idx[1];
2921:             idx += 2;
2922:             tmp0 = x[i1];
2923:             tmp1 = x[i2];
2924:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2925:             v1 += 2;
2926:           }

2928:           if (n == sz - 1) {
2929:             tmp0 = x[*idx];
2930:             sum1 -= *v1 * tmp0;
2931:           }
2932:           t[row]   = sum1;
2933:           x[row++] = sum1 * (*ibdiag++);
2934:           break;
2935:         case 2:
2936:           v2   = a->a + ii[row + 1];
2937:           sum1 = b[row];
2938:           sum2 = b[row + 1];
2939:           for (n = 0; n < sz - 1; n += 2) {
2940:             i1 = idx[0];
2941:             i2 = idx[1];
2942:             idx += 2;
2943:             tmp0 = x[i1];
2944:             tmp1 = x[i2];
2945:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946:             v1 += 2;
2947:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2948:             v2 += 2;
2949:           }

2951:           if (n == sz - 1) {
2952:             tmp0 = x[*idx];
2953:             sum1 -= v1[0] * tmp0;
2954:             sum2 -= v2[0] * tmp0;
2955:           }
2956:           t[row]     = sum1;
2957:           t[row + 1] = sum2;
2958:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2959:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2960:           ibdiag += 4;
2961:           break;
2962:         case 3:
2963:           v2   = a->a + ii[row + 1];
2964:           v3   = a->a + ii[row + 2];
2965:           sum1 = b[row];
2966:           sum2 = b[row + 1];
2967:           sum3 = b[row + 2];
2968:           for (n = 0; n < sz - 1; n += 2) {
2969:             i1 = idx[0];
2970:             i2 = idx[1];
2971:             idx += 2;
2972:             tmp0 = x[i1];
2973:             tmp1 = x[i2];
2974:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2975:             v1 += 2;
2976:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2977:             v2 += 2;
2978:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2979:             v3 += 2;
2980:           }

2982:           if (n == sz - 1) {
2983:             tmp0 = x[*idx];
2984:             sum1 -= v1[0] * tmp0;
2985:             sum2 -= v2[0] * tmp0;
2986:             sum3 -= v3[0] * tmp0;
2987:           }
2988:           t[row]     = sum1;
2989:           t[row + 1] = sum2;
2990:           t[row + 2] = sum3;
2991:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2992:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2993:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2994:           ibdiag += 9;
2995:           break;
2996:         case 4:
2997:           v2   = a->a + ii[row + 1];
2998:           v3   = a->a + ii[row + 2];
2999:           v4   = a->a + ii[row + 3];
3000:           sum1 = b[row];
3001:           sum2 = b[row + 1];
3002:           sum3 = b[row + 2];
3003:           sum4 = b[row + 3];
3004:           for (n = 0; n < sz - 1; n += 2) {
3005:             i1 = idx[0];
3006:             i2 = idx[1];
3007:             idx += 2;
3008:             tmp0 = x[i1];
3009:             tmp1 = x[i2];
3010:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3011:             v1 += 2;
3012:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3013:             v2 += 2;
3014:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3015:             v3 += 2;
3016:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3017:             v4 += 2;
3018:           }

3020:           if (n == sz - 1) {
3021:             tmp0 = x[*idx];
3022:             sum1 -= v1[0] * tmp0;
3023:             sum2 -= v2[0] * tmp0;
3024:             sum3 -= v3[0] * tmp0;
3025:             sum4 -= v4[0] * tmp0;
3026:           }
3027:           t[row]     = sum1;
3028:           t[row + 1] = sum2;
3029:           t[row + 2] = sum3;
3030:           t[row + 3] = sum4;
3031:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3032:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3033:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3034:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3035:           ibdiag += 16;
3036:           break;
3037:         case 5:
3038:           v2   = a->a + ii[row + 1];
3039:           v3   = a->a + ii[row + 2];
3040:           v4   = a->a + ii[row + 3];
3041:           v5   = a->a + ii[row + 4];
3042:           sum1 = b[row];
3043:           sum2 = b[row + 1];
3044:           sum3 = b[row + 2];
3045:           sum4 = b[row + 3];
3046:           sum5 = b[row + 4];
3047:           for (n = 0; n < sz - 1; n += 2) {
3048:             i1 = idx[0];
3049:             i2 = idx[1];
3050:             idx += 2;
3051:             tmp0 = x[i1];
3052:             tmp1 = x[i2];
3053:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3054:             v1 += 2;
3055:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3056:             v2 += 2;
3057:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3058:             v3 += 2;
3059:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3060:             v4 += 2;
3061:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3062:             v5 += 2;
3063:           }

3065:           if (n == sz - 1) {
3066:             tmp0 = x[*idx];
3067:             sum1 -= v1[0] * tmp0;
3068:             sum2 -= v2[0] * tmp0;
3069:             sum3 -= v3[0] * tmp0;
3070:             sum4 -= v4[0] * tmp0;
3071:             sum5 -= v5[0] * tmp0;
3072:           }
3073:           t[row]     = sum1;
3074:           t[row + 1] = sum2;
3075:           t[row + 2] = sum3;
3076:           t[row + 3] = sum4;
3077:           t[row + 4] = sum5;
3078:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3079:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3080:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3081:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3082:           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3083:           ibdiag += 25;
3084:           break;
3085:         default:
3086:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3087:         }
3088:       }

3090:       xb = t;
3091:       PetscCall(PetscLogFlops(a->nz));
3092:     } else xb = b;
3093:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3094:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3095:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3096:         ibdiag -= sizes[i] * sizes[i];
3097:         sz  = ii[row + 1] - diag[row] - 1;
3098:         v1  = a->a + diag[row] + 1;
3099:         idx = a->j + diag[row] + 1;

3101:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3102:         switch (sizes[i]) {
3103:         case 1:

3105:           sum1 = xb[row];
3106:           for (n = 0; n < sz - 1; n += 2) {
3107:             i1 = idx[0];
3108:             i2 = idx[1];
3109:             idx += 2;
3110:             tmp0 = x[i1];
3111:             tmp1 = x[i2];
3112:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3113:             v1 += 2;
3114:           }

3116:           if (n == sz - 1) {
3117:             tmp0 = x[*idx];
3118:             sum1 -= *v1 * tmp0;
3119:           }
3120:           x[row--] = sum1 * (*ibdiag);
3121:           break;

3123:         case 2:

3125:           sum1 = xb[row];
3126:           sum2 = xb[row - 1];
3127:           /* note that sum1 is associated with the second of the two rows */
3128:           v2 = a->a + diag[row - 1] + 2;
3129:           for (n = 0; n < sz - 1; n += 2) {
3130:             i1 = idx[0];
3131:             i2 = idx[1];
3132:             idx += 2;
3133:             tmp0 = x[i1];
3134:             tmp1 = x[i2];
3135:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3136:             v1 += 2;
3137:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3138:             v2 += 2;
3139:           }

3141:           if (n == sz - 1) {
3142:             tmp0 = x[*idx];
3143:             sum1 -= *v1 * tmp0;
3144:             sum2 -= *v2 * tmp0;
3145:           }
3146:           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3147:           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3148:           break;
3149:         case 3:

3151:           sum1 = xb[row];
3152:           sum2 = xb[row - 1];
3153:           sum3 = xb[row - 2];
3154:           v2   = a->a + diag[row - 1] + 2;
3155:           v3   = a->a + diag[row - 2] + 3;
3156:           for (n = 0; n < sz - 1; n += 2) {
3157:             i1 = idx[0];
3158:             i2 = idx[1];
3159:             idx += 2;
3160:             tmp0 = x[i1];
3161:             tmp1 = x[i2];
3162:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3163:             v1 += 2;
3164:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3165:             v2 += 2;
3166:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3167:             v3 += 2;
3168:           }

3170:           if (n == sz - 1) {
3171:             tmp0 = x[*idx];
3172:             sum1 -= *v1 * tmp0;
3173:             sum2 -= *v2 * tmp0;
3174:             sum3 -= *v3 * tmp0;
3175:           }
3176:           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3177:           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3178:           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3179:           break;
3180:         case 4:

3182:           sum1 = xb[row];
3183:           sum2 = xb[row - 1];
3184:           sum3 = xb[row - 2];
3185:           sum4 = xb[row - 3];
3186:           v2   = a->a + diag[row - 1] + 2;
3187:           v3   = a->a + diag[row - 2] + 3;
3188:           v4   = a->a + diag[row - 3] + 4;
3189:           for (n = 0; n < sz - 1; n += 2) {
3190:             i1 = idx[0];
3191:             i2 = idx[1];
3192:             idx += 2;
3193:             tmp0 = x[i1];
3194:             tmp1 = x[i2];
3195:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3196:             v1 += 2;
3197:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3198:             v2 += 2;
3199:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3200:             v3 += 2;
3201:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3202:             v4 += 2;
3203:           }

3205:           if (n == sz - 1) {
3206:             tmp0 = x[*idx];
3207:             sum1 -= *v1 * tmp0;
3208:             sum2 -= *v2 * tmp0;
3209:             sum3 -= *v3 * tmp0;
3210:             sum4 -= *v4 * tmp0;
3211:           }
3212:           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3213:           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3214:           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3215:           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3216:           break;
3217:         case 5:

3219:           sum1 = xb[row];
3220:           sum2 = xb[row - 1];
3221:           sum3 = xb[row - 2];
3222:           sum4 = xb[row - 3];
3223:           sum5 = xb[row - 4];
3224:           v2   = a->a + diag[row - 1] + 2;
3225:           v3   = a->a + diag[row - 2] + 3;
3226:           v4   = a->a + diag[row - 3] + 4;
3227:           v5   = a->a + diag[row - 4] + 5;
3228:           for (n = 0; n < sz - 1; n += 2) {
3229:             i1 = idx[0];
3230:             i2 = idx[1];
3231:             idx += 2;
3232:             tmp0 = x[i1];
3233:             tmp1 = x[i2];
3234:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3235:             v1 += 2;
3236:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3237:             v2 += 2;
3238:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3239:             v3 += 2;
3240:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3241:             v4 += 2;
3242:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3243:             v5 += 2;
3244:           }

3246:           if (n == sz - 1) {
3247:             tmp0 = x[*idx];
3248:             sum1 -= *v1 * tmp0;
3249:             sum2 -= *v2 * tmp0;
3250:             sum3 -= *v3 * tmp0;
3251:             sum4 -= *v4 * tmp0;
3252:             sum5 -= *v5 * tmp0;
3253:           }
3254:           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3255:           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3256:           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3257:           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3258:           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3259:           break;
3260:         default:
3261:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3262:         }
3263:       }

3265:       PetscCall(PetscLogFlops(a->nz));
3266:     }
3267:     its--;
3268:   }
3269:   while (its--) {
3270:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3271:       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3272:         sz  = diag[row] - ii[row];
3273:         v1  = a->a + ii[row];
3274:         idx = a->j + ii[row];
3275:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3276:         switch (sizes[i]) {
3277:         case 1:
3278:           sum1 = b[row];
3279:           for (n = 0; n < sz - 1; n += 2) {
3280:             i1 = idx[0];
3281:             i2 = idx[1];
3282:             idx += 2;
3283:             tmp0 = x[i1];
3284:             tmp1 = x[i2];
3285:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3286:             v1 += 2;
3287:           }
3288:           if (n == sz - 1) {
3289:             tmp0 = x[*idx++];
3290:             sum1 -= *v1 * tmp0;
3291:             v1++;
3292:           }
3293:           t[row] = sum1;
3294:           sz     = ii[row + 1] - diag[row] - 1;
3295:           idx    = a->j + diag[row] + 1;
3296:           v1 += 1;
3297:           for (n = 0; n < sz - 1; n += 2) {
3298:             i1 = idx[0];
3299:             i2 = idx[1];
3300:             idx += 2;
3301:             tmp0 = x[i1];
3302:             tmp1 = x[i2];
3303:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3304:             v1 += 2;
3305:           }
3306:           if (n == sz - 1) {
3307:             tmp0 = x[*idx++];
3308:             sum1 -= *v1 * tmp0;
3309:           }
3310:           /* in MatSOR_SeqAIJ this line would be
3311:            *
3312:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3313:            *
3314:            * but omega == 1, so this becomes
3315:            *
3316:            * x[row] = sum1*(*ibdiag++);
3317:            *
3318:            */
3319:           x[row] = sum1 * (*ibdiag);
3320:           break;
3321:         case 2:
3322:           v2   = a->a + ii[row + 1];
3323:           sum1 = b[row];
3324:           sum2 = b[row + 1];
3325:           for (n = 0; n < sz - 1; n += 2) {
3326:             i1 = idx[0];
3327:             i2 = idx[1];
3328:             idx += 2;
3329:             tmp0 = x[i1];
3330:             tmp1 = x[i2];
3331:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3332:             v1 += 2;
3333:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3334:             v2 += 2;
3335:           }
3336:           if (n == sz - 1) {
3337:             tmp0 = x[*idx++];
3338:             sum1 -= v1[0] * tmp0;
3339:             sum2 -= v2[0] * tmp0;
3340:             v1++;
3341:             v2++;
3342:           }
3343:           t[row]     = sum1;
3344:           t[row + 1] = sum2;
3345:           sz         = ii[row + 1] - diag[row] - 2;
3346:           idx        = a->j + diag[row] + 2;
3347:           v1 += 2;
3348:           v2 += 2;
3349:           for (n = 0; n < sz - 1; n += 2) {
3350:             i1 = idx[0];
3351:             i2 = idx[1];
3352:             idx += 2;
3353:             tmp0 = x[i1];
3354:             tmp1 = x[i2];
3355:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3356:             v1 += 2;
3357:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3358:             v2 += 2;
3359:           }
3360:           if (n == sz - 1) {
3361:             tmp0 = x[*idx];
3362:             sum1 -= v1[0] * tmp0;
3363:             sum2 -= v2[0] * tmp0;
3364:           }
3365:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3366:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3367:           break;
3368:         case 3:
3369:           v2   = a->a + ii[row + 1];
3370:           v3   = a->a + ii[row + 2];
3371:           sum1 = b[row];
3372:           sum2 = b[row + 1];
3373:           sum3 = b[row + 2];
3374:           for (n = 0; n < sz - 1; n += 2) {
3375:             i1 = idx[0];
3376:             i2 = idx[1];
3377:             idx += 2;
3378:             tmp0 = x[i1];
3379:             tmp1 = x[i2];
3380:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3381:             v1 += 2;
3382:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3383:             v2 += 2;
3384:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3385:             v3 += 2;
3386:           }
3387:           if (n == sz - 1) {
3388:             tmp0 = x[*idx++];
3389:             sum1 -= v1[0] * tmp0;
3390:             sum2 -= v2[0] * tmp0;
3391:             sum3 -= v3[0] * tmp0;
3392:             v1++;
3393:             v2++;
3394:             v3++;
3395:           }
3396:           t[row]     = sum1;
3397:           t[row + 1] = sum2;
3398:           t[row + 2] = sum3;
3399:           sz         = ii[row + 1] - diag[row] - 3;
3400:           idx        = a->j + diag[row] + 3;
3401:           v1 += 3;
3402:           v2 += 3;
3403:           v3 += 3;
3404:           for (n = 0; n < sz - 1; n += 2) {
3405:             i1 = idx[0];
3406:             i2 = idx[1];
3407:             idx += 2;
3408:             tmp0 = x[i1];
3409:             tmp1 = x[i2];
3410:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3411:             v1 += 2;
3412:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3413:             v2 += 2;
3414:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3415:             v3 += 2;
3416:           }
3417:           if (n == sz - 1) {
3418:             tmp0 = x[*idx];
3419:             sum1 -= v1[0] * tmp0;
3420:             sum2 -= v2[0] * tmp0;
3421:             sum3 -= v3[0] * tmp0;
3422:           }
3423:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3424:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3425:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3426:           break;
3427:         case 4:
3428:           v2   = a->a + ii[row + 1];
3429:           v3   = a->a + ii[row + 2];
3430:           v4   = a->a + ii[row + 3];
3431:           sum1 = b[row];
3432:           sum2 = b[row + 1];
3433:           sum3 = b[row + 2];
3434:           sum4 = b[row + 3];
3435:           for (n = 0; n < sz - 1; n += 2) {
3436:             i1 = idx[0];
3437:             i2 = idx[1];
3438:             idx += 2;
3439:             tmp0 = x[i1];
3440:             tmp1 = x[i2];
3441:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3442:             v1 += 2;
3443:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3444:             v2 += 2;
3445:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3446:             v3 += 2;
3447:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3448:             v4 += 2;
3449:           }
3450:           if (n == sz - 1) {
3451:             tmp0 = x[*idx++];
3452:             sum1 -= v1[0] * tmp0;
3453:             sum2 -= v2[0] * tmp0;
3454:             sum3 -= v3[0] * tmp0;
3455:             sum4 -= v4[0] * tmp0;
3456:             v1++;
3457:             v2++;
3458:             v3++;
3459:             v4++;
3460:           }
3461:           t[row]     = sum1;
3462:           t[row + 1] = sum2;
3463:           t[row + 2] = sum3;
3464:           t[row + 3] = sum4;
3465:           sz         = ii[row + 1] - diag[row] - 4;
3466:           idx        = a->j + diag[row] + 4;
3467:           v1 += 4;
3468:           v2 += 4;
3469:           v3 += 4;
3470:           v4 += 4;
3471:           for (n = 0; n < sz - 1; n += 2) {
3472:             i1 = idx[0];
3473:             i2 = idx[1];
3474:             idx += 2;
3475:             tmp0 = x[i1];
3476:             tmp1 = x[i2];
3477:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3478:             v1 += 2;
3479:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3480:             v2 += 2;
3481:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3482:             v3 += 2;
3483:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3484:             v4 += 2;
3485:           }
3486:           if (n == sz - 1) {
3487:             tmp0 = x[*idx];
3488:             sum1 -= v1[0] * tmp0;
3489:             sum2 -= v2[0] * tmp0;
3490:             sum3 -= v3[0] * tmp0;
3491:             sum4 -= v4[0] * tmp0;
3492:           }
3493:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3494:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3495:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3496:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3497:           break;
3498:         case 5:
3499:           v2   = a->a + ii[row + 1];
3500:           v3   = a->a + ii[row + 2];
3501:           v4   = a->a + ii[row + 3];
3502:           v5   = a->a + ii[row + 4];
3503:           sum1 = b[row];
3504:           sum2 = b[row + 1];
3505:           sum3 = b[row + 2];
3506:           sum4 = b[row + 3];
3507:           sum5 = b[row + 4];
3508:           for (n = 0; n < sz - 1; n += 2) {
3509:             i1 = idx[0];
3510:             i2 = idx[1];
3511:             idx += 2;
3512:             tmp0 = x[i1];
3513:             tmp1 = x[i2];
3514:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3515:             v1 += 2;
3516:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3517:             v2 += 2;
3518:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3519:             v3 += 2;
3520:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3521:             v4 += 2;
3522:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3523:             v5 += 2;
3524:           }
3525:           if (n == sz - 1) {
3526:             tmp0 = x[*idx++];
3527:             sum1 -= v1[0] * tmp0;
3528:             sum2 -= v2[0] * tmp0;
3529:             sum3 -= v3[0] * tmp0;
3530:             sum4 -= v4[0] * tmp0;
3531:             sum5 -= v5[0] * tmp0;
3532:             v1++;
3533:             v2++;
3534:             v3++;
3535:             v4++;
3536:             v5++;
3537:           }
3538:           t[row]     = sum1;
3539:           t[row + 1] = sum2;
3540:           t[row + 2] = sum3;
3541:           t[row + 3] = sum4;
3542:           t[row + 4] = sum5;
3543:           sz         = ii[row + 1] - diag[row] - 5;
3544:           idx        = a->j + diag[row] + 5;
3545:           v1 += 5;
3546:           v2 += 5;
3547:           v3 += 5;
3548:           v4 += 5;
3549:           v5 += 5;
3550:           for (n = 0; n < sz - 1; n += 2) {
3551:             i1 = idx[0];
3552:             i2 = idx[1];
3553:             idx += 2;
3554:             tmp0 = x[i1];
3555:             tmp1 = x[i2];
3556:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3557:             v1 += 2;
3558:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3559:             v2 += 2;
3560:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3561:             v3 += 2;
3562:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3563:             v4 += 2;
3564:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3565:             v5 += 2;
3566:           }
3567:           if (n == sz - 1) {
3568:             tmp0 = x[*idx];
3569:             sum1 -= v1[0] * tmp0;
3570:             sum2 -= v2[0] * tmp0;
3571:             sum3 -= v3[0] * tmp0;
3572:             sum4 -= v4[0] * tmp0;
3573:             sum5 -= v5[0] * tmp0;
3574:           }
3575:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3576:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3577:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3578:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3579:           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3580:           break;
3581:         default:
3582:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3583:         }
3584:       }
3585:       xb = t;
3586:       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3587:     } else xb = b;

3589:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3590:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3591:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3592:         ibdiag -= sizes[i] * sizes[i];

3594:         /* set RHS */
3595:         if (xb == b) {
3596:           /* whole (old way) */
3597:           sz  = ii[row + 1] - ii[row];
3598:           idx = a->j + ii[row];
3599:           switch (sizes[i]) {
3600:           case 5:
3601:             v5 = a->a + ii[row - 4]; /* fall through */
3602:           case 4:
3603:             v4 = a->a + ii[row - 3]; /* fall through */
3604:           case 3:
3605:             v3 = a->a + ii[row - 2]; /* fall through */
3606:           case 2:
3607:             v2 = a->a + ii[row - 1]; /* fall through */
3608:           case 1:
3609:             v1 = a->a + ii[row];
3610:             break;
3611:           default:
3612:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3613:           }
3614:         } else {
3615:           /* upper, no diag */
3616:           sz  = ii[row + 1] - diag[row] - 1;
3617:           idx = a->j + diag[row] + 1;
3618:           switch (sizes[i]) {
3619:           case 5:
3620:             v5 = a->a + diag[row - 4] + 5; /* fall through */
3621:           case 4:
3622:             v4 = a->a + diag[row - 3] + 4; /* fall through */
3623:           case 3:
3624:             v3 = a->a + diag[row - 2] + 3; /* fall through */
3625:           case 2:
3626:             v2 = a->a + diag[row - 1] + 2; /* fall through */
3627:           case 1:
3628:             v1 = a->a + diag[row] + 1;
3629:           }
3630:         }
3631:         /* set sum */
3632:         switch (sizes[i]) {
3633:         case 5:
3634:           sum5 = xb[row - 4]; /* fall through */
3635:         case 4:
3636:           sum4 = xb[row - 3]; /* fall through */
3637:         case 3:
3638:           sum3 = xb[row - 2]; /* fall through */
3639:         case 2:
3640:           sum2 = xb[row - 1]; /* fall through */
3641:         case 1:
3642:           /* note that sum1 is associated with the last row */
3643:           sum1 = xb[row];
3644:         }
3645:         /* do sums */
3646:         for (n = 0; n < sz - 1; n += 2) {
3647:           i1 = idx[0];
3648:           i2 = idx[1];
3649:           idx += 2;
3650:           tmp0 = x[i1];
3651:           tmp1 = x[i2];
3652:           switch (sizes[i]) {
3653:           case 5:
3654:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3655:             v5 += 2; /* fall through */
3656:           case 4:
3657:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3658:             v4 += 2; /* fall through */
3659:           case 3:
3660:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3661:             v3 += 2; /* fall through */
3662:           case 2:
3663:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3664:             v2 += 2; /* fall through */
3665:           case 1:
3666:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3667:             v1 += 2;
3668:           }
3669:         }
3670:         /* ragged edge */
3671:         if (n == sz - 1) {
3672:           tmp0 = x[*idx];
3673:           switch (sizes[i]) {
3674:           case 5:
3675:             sum5 -= *v5 * tmp0; /* fall through */
3676:           case 4:
3677:             sum4 -= *v4 * tmp0; /* fall through */
3678:           case 3:
3679:             sum3 -= *v3 * tmp0; /* fall through */
3680:           case 2:
3681:             sum2 -= *v2 * tmp0; /* fall through */
3682:           case 1:
3683:             sum1 -= *v1 * tmp0;
3684:           }
3685:         }
3686:         /* update */
3687:         if (xb == b) {
3688:           /* whole (old way) w/ diag */
3689:           switch (sizes[i]) {
3690:           case 5:
3691:             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3692:             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3693:             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3694:             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3695:             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3696:             break;
3697:           case 4:
3698:             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3699:             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3700:             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3701:             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3702:             break;
3703:           case 3:
3704:             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3705:             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3706:             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3707:             break;
3708:           case 2:
3709:             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3710:             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3711:             break;
3712:           case 1:
3713:             x[row--] += sum1 * (*ibdiag);
3714:             break;
3715:           }
3716:         } else {
3717:           /* no diag so set =  */
3718:           switch (sizes[i]) {
3719:           case 5:
3720:             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3721:             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3722:             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3723:             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3724:             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3725:             break;
3726:           case 4:
3727:             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3728:             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3729:             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3730:             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3731:             break;
3732:           case 3:
3733:             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3734:             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3735:             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3736:             break;
3737:           case 2:
3738:             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3739:             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3740:             break;
3741:           case 1:
3742:             x[row--] = sum1 * (*ibdiag);
3743:             break;
3744:           }
3745:         }
3746:       }
3747:       if (xb == b) {
3748:         PetscCall(PetscLogFlops(2.0 * a->nz));
3749:       } else {
3750:         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3751:       }
3752:     }
3753:   }
3754:   if (flag & SOR_EISENSTAT) {
3755:     /*
3756:           Apply  (U + D)^-1  where D is now the block diagonal
3757:     */
3758:     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3759:     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3760:       ibdiag -= sizes[i] * sizes[i];
3761:       sz  = ii[row + 1] - diag[row] - 1;
3762:       v1  = a->a + diag[row] + 1;
3763:       idx = a->j + diag[row] + 1;
3764:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3765:       switch (sizes[i]) {
3766:       case 1:

3768:         sum1 = b[row];
3769:         for (n = 0; n < sz - 1; n += 2) {
3770:           i1 = idx[0];
3771:           i2 = idx[1];
3772:           idx += 2;
3773:           tmp0 = x[i1];
3774:           tmp1 = x[i2];
3775:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3776:           v1 += 2;
3777:         }

3779:         if (n == sz - 1) {
3780:           tmp0 = x[*idx];
3781:           sum1 -= *v1 * tmp0;
3782:         }
3783:         x[row] = sum1 * (*ibdiag);
3784:         row--;
3785:         break;

3787:       case 2:

3789:         sum1 = b[row];
3790:         sum2 = b[row - 1];
3791:         /* note that sum1 is associated with the second of the two rows */
3792:         v2 = a->a + diag[row - 1] + 2;
3793:         for (n = 0; n < sz - 1; n += 2) {
3794:           i1 = idx[0];
3795:           i2 = idx[1];
3796:           idx += 2;
3797:           tmp0 = x[i1];
3798:           tmp1 = x[i2];
3799:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3800:           v1 += 2;
3801:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3802:           v2 += 2;
3803:         }

3805:         if (n == sz - 1) {
3806:           tmp0 = x[*idx];
3807:           sum1 -= *v1 * tmp0;
3808:           sum2 -= *v2 * tmp0;
3809:         }
3810:         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3811:         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3812:         row -= 2;
3813:         break;
3814:       case 3:

3816:         sum1 = b[row];
3817:         sum2 = b[row - 1];
3818:         sum3 = b[row - 2];
3819:         v2   = a->a + diag[row - 1] + 2;
3820:         v3   = a->a + diag[row - 2] + 3;
3821:         for (n = 0; n < sz - 1; n += 2) {
3822:           i1 = idx[0];
3823:           i2 = idx[1];
3824:           idx += 2;
3825:           tmp0 = x[i1];
3826:           tmp1 = x[i2];
3827:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3828:           v1 += 2;
3829:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3830:           v2 += 2;
3831:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3832:           v3 += 2;
3833:         }

3835:         if (n == sz - 1) {
3836:           tmp0 = x[*idx];
3837:           sum1 -= *v1 * tmp0;
3838:           sum2 -= *v2 * tmp0;
3839:           sum3 -= *v3 * tmp0;
3840:         }
3841:         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3842:         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3843:         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3844:         row -= 3;
3845:         break;
3846:       case 4:

3848:         sum1 = b[row];
3849:         sum2 = b[row - 1];
3850:         sum3 = b[row - 2];
3851:         sum4 = b[row - 3];
3852:         v2   = a->a + diag[row - 1] + 2;
3853:         v3   = a->a + diag[row - 2] + 3;
3854:         v4   = a->a + diag[row - 3] + 4;
3855:         for (n = 0; n < sz - 1; n += 2) {
3856:           i1 = idx[0];
3857:           i2 = idx[1];
3858:           idx += 2;
3859:           tmp0 = x[i1];
3860:           tmp1 = x[i2];
3861:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3862:           v1 += 2;
3863:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3864:           v2 += 2;
3865:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3866:           v3 += 2;
3867:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3868:           v4 += 2;
3869:         }

3871:         if (n == sz - 1) {
3872:           tmp0 = x[*idx];
3873:           sum1 -= *v1 * tmp0;
3874:           sum2 -= *v2 * tmp0;
3875:           sum3 -= *v3 * tmp0;
3876:           sum4 -= *v4 * tmp0;
3877:         }
3878:         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3879:         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3880:         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3881:         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3882:         row -= 4;
3883:         break;
3884:       case 5:

3886:         sum1 = b[row];
3887:         sum2 = b[row - 1];
3888:         sum3 = b[row - 2];
3889:         sum4 = b[row - 3];
3890:         sum5 = b[row - 4];
3891:         v2   = a->a + diag[row - 1] + 2;
3892:         v3   = a->a + diag[row - 2] + 3;
3893:         v4   = a->a + diag[row - 3] + 4;
3894:         v5   = a->a + diag[row - 4] + 5;
3895:         for (n = 0; n < sz - 1; n += 2) {
3896:           i1 = idx[0];
3897:           i2 = idx[1];
3898:           idx += 2;
3899:           tmp0 = x[i1];
3900:           tmp1 = x[i2];
3901:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3902:           v1 += 2;
3903:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3904:           v2 += 2;
3905:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3906:           v3 += 2;
3907:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3908:           v4 += 2;
3909:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3910:           v5 += 2;
3911:         }

3913:         if (n == sz - 1) {
3914:           tmp0 = x[*idx];
3915:           sum1 -= *v1 * tmp0;
3916:           sum2 -= *v2 * tmp0;
3917:           sum3 -= *v3 * tmp0;
3918:           sum4 -= *v4 * tmp0;
3919:           sum5 -= *v5 * tmp0;
3920:         }
3921:         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3922:         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3923:         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3924:         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3925:         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3926:         row -= 5;
3927:         break;
3928:       default:
3929:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3930:       }
3931:     }
3932:     PetscCall(PetscLogFlops(a->nz));

3934:     /*
3935:            t = b - D x    where D is the block diagonal
3936:     */
3937:     cnt = 0;
3938:     for (i = 0, row = 0; i < m; i++) {
3939:       switch (sizes[i]) {
3940:       case 1:
3941:         t[row] = b[row] - bdiag[cnt++] * x[row];
3942:         row++;
3943:         break;
3944:       case 2:
3945:         x1         = x[row];
3946:         x2         = x[row + 1];
3947:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3948:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3949:         t[row]     = b[row] - tmp1;
3950:         t[row + 1] = b[row + 1] - tmp2;
3951:         row += 2;
3952:         cnt += 4;
3953:         break;
3954:       case 3:
3955:         x1         = x[row];
3956:         x2         = x[row + 1];
3957:         x3         = x[row + 2];
3958:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3959:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3960:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3961:         t[row]     = b[row] - tmp1;
3962:         t[row + 1] = b[row + 1] - tmp2;
3963:         t[row + 2] = b[row + 2] - tmp3;
3964:         row += 3;
3965:         cnt += 9;
3966:         break;
3967:       case 4:
3968:         x1         = x[row];
3969:         x2         = x[row + 1];
3970:         x3         = x[row + 2];
3971:         x4         = x[row + 3];
3972:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3973:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3974:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3975:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3976:         t[row]     = b[row] - tmp1;
3977:         t[row + 1] = b[row + 1] - tmp2;
3978:         t[row + 2] = b[row + 2] - tmp3;
3979:         t[row + 3] = b[row + 3] - tmp4;
3980:         row += 4;
3981:         cnt += 16;
3982:         break;
3983:       case 5:
3984:         x1         = x[row];
3985:         x2         = x[row + 1];
3986:         x3         = x[row + 2];
3987:         x4         = x[row + 3];
3988:         x5         = x[row + 4];
3989:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3990:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3991:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3992:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3993:         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3994:         t[row]     = b[row] - tmp1;
3995:         t[row + 1] = b[row + 1] - tmp2;
3996:         t[row + 2] = b[row + 2] - tmp3;
3997:         t[row + 3] = b[row + 3] - tmp4;
3998:         t[row + 4] = b[row + 4] - tmp5;
3999:         row += 5;
4000:         cnt += 25;
4001:         break;
4002:       default:
4003:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4004:       }
4005:     }
4006:     PetscCall(PetscLogFlops(m));

4008:     /*
4009:           Apply (L + D)^-1 where D is the block diagonal
4010:     */
4011:     for (i = 0, row = 0; i < m; i++) {
4012:       sz  = diag[row] - ii[row];
4013:       v1  = a->a + ii[row];
4014:       idx = a->j + ii[row];
4015:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4016:       switch (sizes[i]) {
4017:       case 1:

4019:         sum1 = t[row];
4020:         for (n = 0; n < sz - 1; n += 2) {
4021:           i1 = idx[0];
4022:           i2 = idx[1];
4023:           idx += 2;
4024:           tmp0 = t[i1];
4025:           tmp1 = t[i2];
4026:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4027:           v1 += 2;
4028:         }

4030:         if (n == sz - 1) {
4031:           tmp0 = t[*idx];
4032:           sum1 -= *v1 * tmp0;
4033:         }
4034:         x[row] += t[row] = sum1 * (*ibdiag++);
4035:         row++;
4036:         break;
4037:       case 2:
4038:         v2   = a->a + ii[row + 1];
4039:         sum1 = t[row];
4040:         sum2 = t[row + 1];
4041:         for (n = 0; n < sz - 1; n += 2) {
4042:           i1 = idx[0];
4043:           i2 = idx[1];
4044:           idx += 2;
4045:           tmp0 = t[i1];
4046:           tmp1 = t[i2];
4047:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4048:           v1 += 2;
4049:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4050:           v2 += 2;
4051:         }

4053:         if (n == sz - 1) {
4054:           tmp0 = t[*idx];
4055:           sum1 -= v1[0] * tmp0;
4056:           sum2 -= v2[0] * tmp0;
4057:         }
4058:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4059:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4060:         ibdiag += 4;
4061:         row += 2;
4062:         break;
4063:       case 3:
4064:         v2   = a->a + ii[row + 1];
4065:         v3   = a->a + ii[row + 2];
4066:         sum1 = t[row];
4067:         sum2 = t[row + 1];
4068:         sum3 = t[row + 2];
4069:         for (n = 0; n < sz - 1; n += 2) {
4070:           i1 = idx[0];
4071:           i2 = idx[1];
4072:           idx += 2;
4073:           tmp0 = t[i1];
4074:           tmp1 = t[i2];
4075:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4076:           v1 += 2;
4077:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4078:           v2 += 2;
4079:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4080:           v3 += 2;
4081:         }

4083:         if (n == sz - 1) {
4084:           tmp0 = t[*idx];
4085:           sum1 -= v1[0] * tmp0;
4086:           sum2 -= v2[0] * tmp0;
4087:           sum3 -= v3[0] * tmp0;
4088:         }
4089:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4090:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4091:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4092:         ibdiag += 9;
4093:         row += 3;
4094:         break;
4095:       case 4:
4096:         v2   = a->a + ii[row + 1];
4097:         v3   = a->a + ii[row + 2];
4098:         v4   = a->a + ii[row + 3];
4099:         sum1 = t[row];
4100:         sum2 = t[row + 1];
4101:         sum3 = t[row + 2];
4102:         sum4 = t[row + 3];
4103:         for (n = 0; n < sz - 1; n += 2) {
4104:           i1 = idx[0];
4105:           i2 = idx[1];
4106:           idx += 2;
4107:           tmp0 = t[i1];
4108:           tmp1 = t[i2];
4109:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4110:           v1 += 2;
4111:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4112:           v2 += 2;
4113:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4114:           v3 += 2;
4115:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4116:           v4 += 2;
4117:         }

4119:         if (n == sz - 1) {
4120:           tmp0 = t[*idx];
4121:           sum1 -= v1[0] * tmp0;
4122:           sum2 -= v2[0] * tmp0;
4123:           sum3 -= v3[0] * tmp0;
4124:           sum4 -= v4[0] * tmp0;
4125:         }
4126:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4127:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4128:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4129:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4130:         ibdiag += 16;
4131:         row += 4;
4132:         break;
4133:       case 5:
4134:         v2   = a->a + ii[row + 1];
4135:         v3   = a->a + ii[row + 2];
4136:         v4   = a->a + ii[row + 3];
4137:         v5   = a->a + ii[row + 4];
4138:         sum1 = t[row];
4139:         sum2 = t[row + 1];
4140:         sum3 = t[row + 2];
4141:         sum4 = t[row + 3];
4142:         sum5 = t[row + 4];
4143:         for (n = 0; n < sz - 1; n += 2) {
4144:           i1 = idx[0];
4145:           i2 = idx[1];
4146:           idx += 2;
4147:           tmp0 = t[i1];
4148:           tmp1 = t[i2];
4149:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4150:           v1 += 2;
4151:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4152:           v2 += 2;
4153:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4154:           v3 += 2;
4155:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4156:           v4 += 2;
4157:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4158:           v5 += 2;
4159:         }

4161:         if (n == sz - 1) {
4162:           tmp0 = t[*idx];
4163:           sum1 -= v1[0] * tmp0;
4164:           sum2 -= v2[0] * tmp0;
4165:           sum3 -= v3[0] * tmp0;
4166:           sum4 -= v4[0] * tmp0;
4167:           sum5 -= v5[0] * tmp0;
4168:         }
4169:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4170:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4171:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4172:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4173:         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4174:         ibdiag += 25;
4175:         row += 5;
4176:         break;
4177:       default:
4178:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4179:       }
4180:     }
4181:     PetscCall(PetscLogFlops(a->nz));
4182:   }
4183:   PetscCall(VecRestoreArray(xx, &x));
4184:   PetscCall(VecRestoreArrayRead(bb, &b));
4185:   PetscFunctionReturn(PETSC_SUCCESS);
4186: }

4188: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4189: {
4190:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4191:   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4192:   const MatScalar   *bdiag = a->inode.bdiag;
4193:   const PetscScalar *b;
4194:   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4195:   const PetscInt    *sizes = a->inode.size;

4197:   PetscFunctionBegin;
4198:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4199:   PetscCall(VecGetArray(xx, &x));
4200:   PetscCall(VecGetArrayRead(bb, &b));
4201:   cnt = 0;
4202:   for (i = 0, row = 0; i < m; i++) {
4203:     switch (sizes[i]) {
4204:     case 1:
4205:       x[row] = b[row] * bdiag[cnt++];
4206:       row++;
4207:       break;
4208:     case 2:
4209:       x1       = b[row];
4210:       x2       = b[row + 1];
4211:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4212:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4213:       x[row++] = tmp1;
4214:       x[row++] = tmp2;
4215:       cnt += 4;
4216:       break;
4217:     case 3:
4218:       x1       = b[row];
4219:       x2       = b[row + 1];
4220:       x3       = b[row + 2];
4221:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4222:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4223:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4224:       x[row++] = tmp1;
4225:       x[row++] = tmp2;
4226:       x[row++] = tmp3;
4227:       cnt += 9;
4228:       break;
4229:     case 4:
4230:       x1       = b[row];
4231:       x2       = b[row + 1];
4232:       x3       = b[row + 2];
4233:       x4       = b[row + 3];
4234:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4235:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4236:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4237:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4238:       x[row++] = tmp1;
4239:       x[row++] = tmp2;
4240:       x[row++] = tmp3;
4241:       x[row++] = tmp4;
4242:       cnt += 16;
4243:       break;
4244:     case 5:
4245:       x1       = b[row];
4246:       x2       = b[row + 1];
4247:       x3       = b[row + 2];
4248:       x4       = b[row + 3];
4249:       x5       = b[row + 4];
4250:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4251:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4252:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4253:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4254:       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4255:       x[row++] = tmp1;
4256:       x[row++] = tmp2;
4257:       x[row++] = tmp3;
4258:       x[row++] = tmp4;
4259:       x[row++] = tmp5;
4260:       cnt += 25;
4261:       break;
4262:     default:
4263:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4264:     }
4265:   }
4266:   PetscCall(PetscLogFlops(2.0 * cnt));
4267:   PetscCall(VecRestoreArray(xx, &x));
4268:   PetscCall(VecRestoreArrayRead(bb, &b));
4269:   PetscFunctionReturn(PETSC_SUCCESS);
4270: }

4272: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4273: {
4274:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4276:   PetscFunctionBegin;
4277:   a->inode.node_count       = 0;
4278:   a->inode.use              = PETSC_FALSE;
4279:   a->inode.checked          = PETSC_FALSE;
4280:   a->inode.mat_nonzerostate = -1;
4281:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4282:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4283:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4284:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4285:   A->ops->coloringpatch     = NULL;
4286:   A->ops->multdiagonalblock = NULL;
4287:   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4288:   PetscFunctionReturn(PETSC_SUCCESS);
4289: }

4291: /*
4292:     samestructure indicates that the matrix has not changed its nonzero structure so we
4293:     do not need to recompute the inodes
4294: */
4295: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4296: {
4297:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4298:   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4299:   PetscBool       flag;
4300:   const PetscInt *idx, *idy, *ii;

4302:   PetscFunctionBegin;
4303:   if (!a->inode.use) {
4304:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4305:     PetscCall(PetscFree(a->inode.size));
4306:     PetscFunctionReturn(PETSC_SUCCESS);
4307:   }
4308:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);

4310:   m = A->rmap->n;
4311:   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4312:   ns = a->inode.size;

4314:   i          = 0;
4315:   node_count = 0;
4316:   idx        = a->j;
4317:   ii         = a->i;
4318:   if (idx) {
4319:     while (i < m) {            /* For each row */
4320:       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4321:       /* Limits the number of elements in a node to 'a->inode.limit' */
4322:       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4323:         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4324:         if (nzy != nzx) break;
4325:         idy += nzx; /* Same nonzero pattern */
4326:         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4327:         if (!flag) break;
4328:       }
4329:       ns[node_count++] = blk_size;
4330:       idx += blk_size * nzx;
4331:       i = j;
4332:     }
4333:   }
4334:   /* If not enough inodes found,, do not use inode version of the routines */
4335:   if (!m || !idx || node_count > .8 * m) {
4336:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4337:     PetscCall(PetscFree(a->inode.size));
4338:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4339:   } else {
4340:     if (!A->factortype) {
4341:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4342:       if (A->rmap->n == A->cmap->n) {
4343:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4344:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4345:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4346:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4347:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4348:       }
4349:     } else {
4350:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4351:     }
4352:     a->inode.node_count = node_count;
4353:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4354:   }
4355:   a->inode.checked          = PETSC_TRUE;
4356:   a->inode.mat_nonzerostate = A->nonzerostate;
4357:   PetscFunctionReturn(PETSC_SUCCESS);
4358: }

4360: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4361: {
4362:   Mat         B = *C;
4363:   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4364:   PetscInt    m = A->rmap->n;

4366:   PetscFunctionBegin;
4367:   c->inode.use              = a->inode.use;
4368:   c->inode.limit            = a->inode.limit;
4369:   c->inode.max_limit        = a->inode.max_limit;
4370:   c->inode.checked          = PETSC_FALSE;
4371:   c->inode.size             = NULL;
4372:   c->inode.node_count       = 0;
4373:   c->inode.ibdiagvalid      = PETSC_FALSE;
4374:   c->inode.ibdiag           = NULL;
4375:   c->inode.bdiag            = NULL;
4376:   c->inode.mat_nonzerostate = -1;
4377:   if (a->inode.use) {
4378:     if (a->inode.checked && a->inode.size) {
4379:       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4380:       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));

4382:       c->inode.checked          = PETSC_TRUE;
4383:       c->inode.node_count       = a->inode.node_count;
4384:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4385:     }
4386:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4387:     if (!B->factortype) {
4388:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4389:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4390:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4391:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4392:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4393:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4394:     } else {
4395:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4396:     }
4397:   }
4398:   PetscFunctionReturn(PETSC_SUCCESS);
4399: }

4401: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4402: {
4403:   PetscInt        k;
4404:   const PetscInt *vi;

4406:   PetscFunctionBegin;
4407:   vi = aj + ai[row];
4408:   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4409:   vi        = aj + adiag[row];
4410:   cols[nzl] = vi[0];
4411:   vi        = aj + adiag[row + 1] + 1;
4412:   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4413:   PetscFunctionReturn(PETSC_SUCCESS);
4414: }
4415: /*
4416:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4417:    Modified from MatSeqAIJCheckInode().

4419:    Input Parameters:
4420: .  Mat A - ILU or LU matrix factor

4422: */
4423: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4424: {
4425:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4426:   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4427:   PetscInt       *cols1, *cols2, *ns;
4428:   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4429:   PetscBool       flag;

4431:   PetscFunctionBegin;
4432:   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4433:   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);

4435:   m = A->rmap->n;
4436:   if (a->inode.size) ns = a->inode.size;
4437:   else PetscCall(PetscMalloc1(m + 1, &ns));

4439:   i          = 0;
4440:   node_count = 0;
4441:   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4442:   while (i < m) {                       /* For each row */
4443:     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4444:     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4445:     nzx  = nzl1 + nzu1 + 1;
4446:     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));

4448:     /* Limits the number of elements in a node to 'a->inode.limit' */
4449:     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4450:       nzl2 = ai[j + 1] - ai[j];
4451:       nzu2 = adiag[j] - adiag[j + 1] - 1;
4452:       nzy  = nzl2 + nzu2 + 1;
4453:       if (nzy != nzx) break;
4454:       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4455:       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4456:       if (!flag) break;
4457:     }
4458:     ns[node_count++] = blk_size;
4459:     i                = j;
4460:   }
4461:   PetscCall(PetscFree2(cols1, cols2));
4462:   /* If not enough inodes found,, do not use inode version of the routines */
4463:   if (!m || node_count > .8 * m) {
4464:     PetscCall(PetscFree(ns));

4466:     a->inode.node_count = 0;
4467:     a->inode.size       = NULL;
4468:     a->inode.use        = PETSC_FALSE;

4470:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4471:   } else {
4472:     A->ops->mult              = NULL;
4473:     A->ops->sor               = NULL;
4474:     A->ops->multadd           = NULL;
4475:     A->ops->getrowij          = NULL;
4476:     A->ops->restorerowij      = NULL;
4477:     A->ops->getcolumnij       = NULL;
4478:     A->ops->restorecolumnij   = NULL;
4479:     A->ops->coloringpatch     = NULL;
4480:     A->ops->multdiagonalblock = NULL;
4481:     a->inode.node_count       = node_count;
4482:     a->inode.size             = ns;

4484:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4485:   }
4486:   a->inode.checked = PETSC_TRUE;
4487:   PetscFunctionReturn(PETSC_SUCCESS);
4488: }

4490: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4491: {
4492:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4494:   PetscFunctionBegin;
4495:   a->inode.ibdiagvalid = PETSC_FALSE;
4496:   PetscFunctionReturn(PETSC_SUCCESS);
4497: }

4499: /*
4500:      This is really ugly. if inodes are used this replaces the
4501:   permutations with ones that correspond to rows/cols of the matrix
4502:   rather than inode blocks
4503: */
4504: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4505: {
4506:   PetscFunctionBegin;
4507:   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4508:   PetscFunctionReturn(PETSC_SUCCESS);
4509: }

4511: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4512: {
4513:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4514:   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4515:   const PetscInt *ridx, *cidx;
4516:   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4517:   PetscInt        nslim_col, *ns_col;
4518:   IS              ris = *rperm, cis = *cperm;

4520:   PetscFunctionBegin;
4521:   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4522:   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */

4524:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4525:   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4526:   PetscCall(PetscMalloc2(m, &permr, n, &permc));

4528:   PetscCall(ISGetIndices(ris, &ridx));
4529:   PetscCall(ISGetIndices(cis, &cidx));

4531:   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4532:   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];

4534:   /* Construct the permutations for rows*/
4535:   for (i = 0, row = 0; i < nslim_row; ++i) {
4536:     indx      = ridx[i];
4537:     start_val = tns[indx];
4538:     end_val   = tns[indx + 1];
4539:     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4540:   }

4542:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4543:   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];

4545:   /* Construct permutations for columns */
4546:   for (i = 0, col = 0; i < nslim_col; ++i) {
4547:     indx      = cidx[i];
4548:     start_val = tns[indx];
4549:     end_val   = tns[indx + 1];
4550:     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4551:   }

4553:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4554:   PetscCall(ISSetPermutation(*rperm));
4555:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4556:   PetscCall(ISSetPermutation(*cperm));

4558:   PetscCall(ISRestoreIndices(ris, &ridx));
4559:   PetscCall(ISRestoreIndices(cis, &cidx));

4561:   PetscCall(PetscFree(ns_col));
4562:   PetscCall(PetscFree2(permr, permc));
4563:   PetscCall(ISDestroy(&cis));
4564:   PetscCall(ISDestroy(&ris));
4565:   PetscCall(PetscFree(tns));
4566:   PetscFunctionReturn(PETSC_SUCCESS);
4567: }

4569: /*@C
4570:   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes

4572:   Not Collective

4574:   Input Parameter:
4575: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`

4577:   Output Parameters:
4578: + node_count - no of inodes present in the matrix.
4579: . sizes      - an array of size `node_count`, with the sizes of each inode.
4580: - limit      - the max size used to generate the inodes.

4582:   Level: advanced

4584:   Note:
4585:   It should be called after the matrix is assembled.
4586:   The contents of the sizes[] array should not be changed.
4587:   `NULL` may be passed for information not needed

4589: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4590: @*/
4591: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4592: {
4593:   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);

4595:   PetscFunctionBegin;
4596:   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4597:   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4598:   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4599:   PetscFunctionReturn(PETSC_SUCCESS);
4600: }

4602: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4603: {
4604:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4606:   PetscFunctionBegin;
4607:   if (node_count) *node_count = a->inode.node_count;
4608:   if (sizes) *sizes = a->inode.size;
4609:   if (limit) *limit = a->inode.limit;
4610:   PetscFunctionReturn(PETSC_SUCCESS);
4611: }