Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
24: ;
25: for (; count + 1 < n; count++, i++)
26: ;
27: if (count < n) i++;
28: *size = i;
29: PetscFunctionReturn(PETSC_SUCCESS);
30: }
31: PetscCall(PetscMalloc1(n + 1, &ns_col));
33: /* Use the same row structure wherever feasible. */
34: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
36: /* if m < n; pad up the remainder with inode_limit */
37: for (; count + 1 < n; count++, i++) ns_col[i] = 1;
38: /* The last node is the odd ball. pad it up with the remaining rows; */
39: if (count < n) {
40: ns_col[i] = n - count;
41: i++;
42: } else if (count > n) {
43: /* Adjust for the over estimation */
44: ns_col[i - 1] += n - count;
45: }
46: *size = i;
47: *ns = ns_col;
48: PetscFunctionReturn(PETSC_SUCCESS);
49: }
51: /*
52: This builds symmetric version of nonzero structure,
53: */
54: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
55: {
56: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
57: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
58: PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
59: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
61: PetscFunctionBegin;
62: nslim_row = a->inode.node_count;
63: m = A->rmap->n;
64: n = A->cmap->n;
65: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
66: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
68: /* Use the row_inode as column_inode */
69: nslim_col = nslim_row;
70: ns_col = ns_row;
72: /* allocate space for reformatted inode structure */
73: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
74: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
76: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
77: nsz = ns_col[i1];
78: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
79: }
80: /* allocate space for row pointers */
81: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
82: *iia = ia;
83: PetscCall(PetscMalloc1(nslim_row + 1, &work));
85: /* determine the number of columns in each row */
86: ia[0] = oshift;
87: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
88: j = aj + ai[row] + ishift;
89: jmax = aj + ai[row + 1] + ishift;
90: if (j == jmax) continue; /* empty row */
91: col = *j++ + ishift;
92: i2 = tvc[col];
93: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94: ia[i1 + 1]++;
95: ia[i2 + 1]++;
96: i2++; /* Start col of next node */
97: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98: i2 = tvc[col];
99: }
100: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101: }
103: /* shift ia[i] to point to next row */
104: for (i1 = 1; i1 < nslim_row + 1; i1++) {
105: row = ia[i1 - 1];
106: ia[i1] += row;
107: work[i1 - 1] = row - oshift;
108: }
110: /* allocate space for column pointers */
111: nz = ia[nslim_row] + (!ishift);
112: PetscCall(PetscMalloc1(nz, &ja));
113: *jja = ja;
115: /* loop over lower triangular part putting into ja */
116: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
117: j = aj + ai[row] + ishift;
118: jmax = aj + ai[row + 1] + ishift;
119: if (j == jmax) continue; /* empty row */
120: col = *j++ + ishift;
121: i2 = tvc[col];
122: while (i2 < i1 && j < jmax) {
123: ja[work[i2]++] = i1 + oshift;
124: ja[work[i1]++] = i2 + oshift;
125: ++i2;
126: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
127: i2 = tvc[col];
128: }
129: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
130: }
131: PetscCall(PetscFree(work));
132: PetscCall(PetscFree2(tns, tvc));
133: PetscFunctionReturn(PETSC_SUCCESS);
134: }
136: /*
137: This builds nonsymmetric version of nonzero structure,
138: */
139: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
140: {
141: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
142: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
143: PetscInt *tns, *tvc, nsz, i1, i2;
144: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
146: PetscFunctionBegin;
147: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
148: nslim_row = a->inode.node_count;
149: n = A->cmap->n;
151: /* Create The column_inode for this matrix */
152: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154: /* allocate space for reformatted column_inode structure */
155: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
156: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
158: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
159: nsz = ns_col[i1];
160: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
161: }
162: /* allocate space for row pointers */
163: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
164: *iia = ia;
165: PetscCall(PetscMalloc1(nslim_row + 1, &work));
167: /* determine the number of columns in each row */
168: ia[0] = oshift;
169: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
170: j = aj + ai[row] + ishift;
171: nz = ai[row + 1] - ai[row];
172: if (!nz) continue; /* empty row */
173: col = *j++ + ishift;
174: i2 = tvc[col];
175: while (nz-- > 0) { /* off-diagonal elements */
176: ia[i1 + 1]++;
177: i2++; /* Start col of next node */
178: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
179: if (nz > 0) i2 = tvc[col];
180: }
181: }
183: /* shift ia[i] to point to next row */
184: for (i1 = 1; i1 < nslim_row + 1; i1++) {
185: row = ia[i1 - 1];
186: ia[i1] += row;
187: work[i1 - 1] = row - oshift;
188: }
190: /* allocate space for column pointers */
191: nz = ia[nslim_row] + (!ishift);
192: PetscCall(PetscMalloc1(nz, &ja));
193: *jja = ja;
195: /* loop over matrix putting into ja */
196: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
197: j = aj + ai[row] + ishift;
198: nz = ai[row + 1] - ai[row];
199: if (!nz) continue; /* empty row */
200: col = *j++ + ishift;
201: i2 = tvc[col];
202: while (nz-- > 0) {
203: ja[work[i1]++] = i2 + oshift;
204: ++i2;
205: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
206: if (nz > 0) i2 = tvc[col];
207: }
208: }
209: PetscCall(PetscFree(ns_col));
210: PetscCall(PetscFree(work));
211: PetscCall(PetscFree2(tns, tvc));
212: PetscFunctionReturn(PETSC_SUCCESS);
213: }
215: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
216: {
217: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
219: PetscFunctionBegin;
220: if (n) *n = a->inode.node_count;
221: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
222: if (!blockcompressed) {
223: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
224: } else if (symmetric) {
225: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
226: } else {
227: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
228: }
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
233: {
234: PetscFunctionBegin;
235: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
237: if (!blockcompressed) {
238: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
239: } else {
240: PetscCall(PetscFree(*ia));
241: PetscCall(PetscFree(*ja));
242: }
243: PetscFunctionReturn(PETSC_SUCCESS);
244: }
246: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247: {
248: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
250: PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
252: PetscFunctionBegin;
253: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
254: nslim_row = a->inode.node_count;
255: n = A->cmap->n;
257: /* Create The column_inode for this matrix */
258: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
260: /* allocate space for reformatted column_inode structure */
261: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
262: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
264: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
265: nsz = ns_col[i1];
266: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
267: }
268: /* allocate space for column pointers */
269: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
270: *iia = ia;
271: PetscCall(PetscMalloc1(nslim_col + 1, &work));
273: /* determine the number of columns in each row */
274: ia[0] = oshift;
275: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
276: j = aj + ai[row] + ishift;
277: col = *j++ + ishift;
278: i2 = tvc[col];
279: nz = ai[row + 1] - ai[row];
280: while (nz-- > 0) { /* off-diagonal elements */
281: /* ia[i1+1]++; */
282: ia[i2 + 1]++;
283: i2++;
284: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285: if (nz > 0) i2 = tvc[col];
286: }
287: }
289: /* shift ia[i] to point to next col */
290: for (i1 = 1; i1 < nslim_col + 1; i1++) {
291: col = ia[i1 - 1];
292: ia[i1] += col;
293: work[i1 - 1] = col - oshift;
294: }
296: /* allocate space for column pointers */
297: nz = ia[nslim_col] + (!ishift);
298: PetscCall(PetscMalloc1(nz, &ja));
299: *jja = ja;
301: /* loop over matrix putting into ja */
302: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
303: j = aj + ai[row] + ishift;
304: col = *j++ + ishift;
305: i2 = tvc[col];
306: nz = ai[row + 1] - ai[row];
307: while (nz-- > 0) {
308: /* ja[work[i1]++] = i2 + oshift; */
309: ja[work[i2]++] = i1 + oshift;
310: i2++;
311: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312: if (nz > 0) i2 = tvc[col];
313: }
314: }
315: PetscCall(PetscFree(ns_col));
316: PetscCall(PetscFree(work));
317: PetscCall(PetscFree2(tns, tvc));
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }
321: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322: {
323: PetscFunctionBegin;
324: PetscCall(MatCreateColInode_Private(A, n, NULL));
325: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
327: if (!blockcompressed) {
328: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
329: } else if (symmetric) {
330: /* Since the indices are symmetric it doesn't matter */
331: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
332: } else {
333: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
334: }
335: PetscFunctionReturn(PETSC_SUCCESS);
336: }
338: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339: {
340: PetscFunctionBegin;
341: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
342: if (!blockcompressed) {
343: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
344: } else {
345: PetscCall(PetscFree(*ia));
346: PetscCall(PetscFree(*ja));
347: }
348: PetscFunctionReturn(PETSC_SUCCESS);
349: }
351: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
352: {
353: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
354: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
355: PetscScalar *y;
356: const PetscScalar *x;
357: const MatScalar *v1, *v2, *v3, *v4, *v5;
358: PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
359: const PetscInt *idx, *ns, *ii;
361: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
362: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
363: #endif
365: PetscFunctionBegin;
366: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
367: node_max = a->inode.node_count;
368: ns = a->inode.size; /* Node Size array */
369: PetscCall(VecGetArrayRead(xx, &x));
370: PetscCall(VecGetArray(yy, &y));
371: idx = a->j;
372: v1 = a->a;
373: ii = a->i;
375: for (i = 0, row = 0; i < node_max; ++i) {
376: nsz = ns[i];
377: n = ii[1] - ii[0];
378: nonzerorow += (n > 0) * nsz;
379: ii += nsz;
380: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
381: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
382: sz = n; /* No of non zeros in this row */
383: /* Switch on the size of Node */
384: switch (nsz) { /* Each loop in 'case' is unrolled */
385: case 1:
386: sum1 = 0.;
388: for (n = 0; n < sz - 1; n += 2) {
389: i1 = idx[0]; /* The instructions are ordered to */
390: i2 = idx[1]; /* make the compiler's job easy */
391: idx += 2;
392: tmp0 = x[i1];
393: tmp1 = x[i2];
394: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
395: v1 += 2;
396: }
398: if (n == sz - 1) { /* Take care of the last nonzero */
399: tmp0 = x[*idx++];
400: sum1 += *v1++ * tmp0;
401: }
402: y[row++] = sum1;
403: break;
404: case 2:
405: sum1 = 0.;
406: sum2 = 0.;
407: v2 = v1 + n;
409: for (n = 0; n < sz - 1; n += 2) {
410: i1 = idx[0];
411: i2 = idx[1];
412: idx += 2;
413: tmp0 = x[i1];
414: tmp1 = x[i2];
415: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
416: v1 += 2;
417: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
418: v2 += 2;
419: }
420: if (n == sz - 1) {
421: tmp0 = x[*idx++];
422: sum1 += *v1++ * tmp0;
423: sum2 += *v2++ * tmp0;
424: }
425: y[row++] = sum1;
426: y[row++] = sum2;
427: v1 = v2; /* Since the next block to be processed starts there*/
428: idx += sz;
429: break;
430: case 3:
431: sum1 = 0.;
432: sum2 = 0.;
433: sum3 = 0.;
434: v2 = v1 + n;
435: v3 = v2 + n;
437: for (n = 0; n < sz - 1; n += 2) {
438: i1 = idx[0];
439: i2 = idx[1];
440: idx += 2;
441: tmp0 = x[i1];
442: tmp1 = x[i2];
443: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
444: v1 += 2;
445: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
446: v2 += 2;
447: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
448: v3 += 2;
449: }
450: if (n == sz - 1) {
451: tmp0 = x[*idx++];
452: sum1 += *v1++ * tmp0;
453: sum2 += *v2++ * tmp0;
454: sum3 += *v3++ * tmp0;
455: }
456: y[row++] = sum1;
457: y[row++] = sum2;
458: y[row++] = sum3;
459: v1 = v3; /* Since the next block to be processed starts there*/
460: idx += 2 * sz;
461: break;
462: case 4:
463: sum1 = 0.;
464: sum2 = 0.;
465: sum3 = 0.;
466: sum4 = 0.;
467: v2 = v1 + n;
468: v3 = v2 + n;
469: v4 = v3 + n;
471: for (n = 0; n < sz - 1; n += 2) {
472: i1 = idx[0];
473: i2 = idx[1];
474: idx += 2;
475: tmp0 = x[i1];
476: tmp1 = x[i2];
477: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
478: v1 += 2;
479: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
480: v2 += 2;
481: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
482: v3 += 2;
483: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
484: v4 += 2;
485: }
486: if (n == sz - 1) {
487: tmp0 = x[*idx++];
488: sum1 += *v1++ * tmp0;
489: sum2 += *v2++ * tmp0;
490: sum3 += *v3++ * tmp0;
491: sum4 += *v4++ * tmp0;
492: }
493: y[row++] = sum1;
494: y[row++] = sum2;
495: y[row++] = sum3;
496: y[row++] = sum4;
497: v1 = v4; /* Since the next block to be processed starts there*/
498: idx += 3 * sz;
499: break;
500: case 5:
501: sum1 = 0.;
502: sum2 = 0.;
503: sum3 = 0.;
504: sum4 = 0.;
505: sum5 = 0.;
506: v2 = v1 + n;
507: v3 = v2 + n;
508: v4 = v3 + n;
509: v5 = v4 + n;
511: for (n = 0; n < sz - 1; n += 2) {
512: i1 = idx[0];
513: i2 = idx[1];
514: idx += 2;
515: tmp0 = x[i1];
516: tmp1 = x[i2];
517: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
518: v1 += 2;
519: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
520: v2 += 2;
521: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
522: v3 += 2;
523: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
524: v4 += 2;
525: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
526: v5 += 2;
527: }
528: if (n == sz - 1) {
529: tmp0 = x[*idx++];
530: sum1 += *v1++ * tmp0;
531: sum2 += *v2++ * tmp0;
532: sum3 += *v3++ * tmp0;
533: sum4 += *v4++ * tmp0;
534: sum5 += *v5++ * tmp0;
535: }
536: y[row++] = sum1;
537: y[row++] = sum2;
538: y[row++] = sum3;
539: y[row++] = sum4;
540: y[row++] = sum5;
541: v1 = v5; /* Since the next block to be processed starts there */
542: idx += 4 * sz;
543: break;
544: default:
545: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
546: }
547: }
548: PetscCall(VecRestoreArrayRead(xx, &x));
549: PetscCall(VecRestoreArray(yy, &y));
550: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
551: PetscFunctionReturn(PETSC_SUCCESS);
552: }
554: /* Almost same code as the MatMult_SeqAIJ_Inode() */
555: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
556: {
557: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
558: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
559: const MatScalar *v1, *v2, *v3, *v4, *v5;
560: const PetscScalar *x;
561: PetscScalar *y, *z, *zt;
562: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
563: const PetscInt *idx, *ns, *ii;
565: PetscFunctionBegin;
566: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
567: node_max = a->inode.node_count;
568: ns = a->inode.size; /* Node Size array */
570: PetscCall(VecGetArrayRead(xx, &x));
571: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
572: zt = z;
574: idx = a->j;
575: v1 = a->a;
576: ii = a->i;
578: for (i = 0, row = 0; i < node_max; ++i) {
579: nsz = ns[i];
580: n = ii[1] - ii[0];
581: ii += nsz;
582: sz = n; /* No of non zeros in this row */
583: /* Switch on the size of Node */
584: switch (nsz) { /* Each loop in 'case' is unrolled */
585: case 1:
586: sum1 = *zt++;
588: for (n = 0; n < sz - 1; n += 2) {
589: i1 = idx[0]; /* The instructions are ordered to */
590: i2 = idx[1]; /* make the compiler's job easy */
591: idx += 2;
592: tmp0 = x[i1];
593: tmp1 = x[i2];
594: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
595: v1 += 2;
596: }
598: if (n == sz - 1) { /* Take care of the last nonzero */
599: tmp0 = x[*idx++];
600: sum1 += *v1++ * tmp0;
601: }
602: y[row++] = sum1;
603: break;
604: case 2:
605: sum1 = *zt++;
606: sum2 = *zt++;
607: v2 = v1 + n;
609: for (n = 0; n < sz - 1; n += 2) {
610: i1 = idx[0];
611: i2 = idx[1];
612: idx += 2;
613: tmp0 = x[i1];
614: tmp1 = x[i2];
615: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
616: v1 += 2;
617: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
618: v2 += 2;
619: }
620: if (n == sz - 1) {
621: tmp0 = x[*idx++];
622: sum1 += *v1++ * tmp0;
623: sum2 += *v2++ * tmp0;
624: }
625: y[row++] = sum1;
626: y[row++] = sum2;
627: v1 = v2; /* Since the next block to be processed starts there*/
628: idx += sz;
629: break;
630: case 3:
631: sum1 = *zt++;
632: sum2 = *zt++;
633: sum3 = *zt++;
634: v2 = v1 + n;
635: v3 = v2 + n;
637: for (n = 0; n < sz - 1; n += 2) {
638: i1 = idx[0];
639: i2 = idx[1];
640: idx += 2;
641: tmp0 = x[i1];
642: tmp1 = x[i2];
643: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
644: v1 += 2;
645: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
646: v2 += 2;
647: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
648: v3 += 2;
649: }
650: if (n == sz - 1) {
651: tmp0 = x[*idx++];
652: sum1 += *v1++ * tmp0;
653: sum2 += *v2++ * tmp0;
654: sum3 += *v3++ * tmp0;
655: }
656: y[row++] = sum1;
657: y[row++] = sum2;
658: y[row++] = sum3;
659: v1 = v3; /* Since the next block to be processed starts there*/
660: idx += 2 * sz;
661: break;
662: case 4:
663: sum1 = *zt++;
664: sum2 = *zt++;
665: sum3 = *zt++;
666: sum4 = *zt++;
667: v2 = v1 + n;
668: v3 = v2 + n;
669: v4 = v3 + n;
671: for (n = 0; n < sz - 1; n += 2) {
672: i1 = idx[0];
673: i2 = idx[1];
674: idx += 2;
675: tmp0 = x[i1];
676: tmp1 = x[i2];
677: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
678: v1 += 2;
679: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
680: v2 += 2;
681: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
682: v3 += 2;
683: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
684: v4 += 2;
685: }
686: if (n == sz - 1) {
687: tmp0 = x[*idx++];
688: sum1 += *v1++ * tmp0;
689: sum2 += *v2++ * tmp0;
690: sum3 += *v3++ * tmp0;
691: sum4 += *v4++ * tmp0;
692: }
693: y[row++] = sum1;
694: y[row++] = sum2;
695: y[row++] = sum3;
696: y[row++] = sum4;
697: v1 = v4; /* Since the next block to be processed starts there*/
698: idx += 3 * sz;
699: break;
700: case 5:
701: sum1 = *zt++;
702: sum2 = *zt++;
703: sum3 = *zt++;
704: sum4 = *zt++;
705: sum5 = *zt++;
706: v2 = v1 + n;
707: v3 = v2 + n;
708: v4 = v3 + n;
709: v5 = v4 + n;
711: for (n = 0; n < sz - 1; n += 2) {
712: i1 = idx[0];
713: i2 = idx[1];
714: idx += 2;
715: tmp0 = x[i1];
716: tmp1 = x[i2];
717: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
718: v1 += 2;
719: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
720: v2 += 2;
721: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
722: v3 += 2;
723: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
724: v4 += 2;
725: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
726: v5 += 2;
727: }
728: if (n == sz - 1) {
729: tmp0 = x[*idx++];
730: sum1 += *v1++ * tmp0;
731: sum2 += *v2++ * tmp0;
732: sum3 += *v3++ * tmp0;
733: sum4 += *v4++ * tmp0;
734: sum5 += *v5++ * tmp0;
735: }
736: y[row++] = sum1;
737: y[row++] = sum2;
738: y[row++] = sum3;
739: y[row++] = sum4;
740: y[row++] = sum5;
741: v1 = v5; /* Since the next block to be processed starts there */
742: idx += 4 * sz;
743: break;
744: default:
745: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
746: }
747: }
748: PetscCall(VecRestoreArrayRead(xx, &x));
749: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
750: PetscCall(PetscLogFlops(2.0 * a->nz));
751: PetscFunctionReturn(PETSC_SUCCESS);
752: }
754: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
755: {
756: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
757: IS iscol = a->col, isrow = a->row;
758: const PetscInt *r, *c, *rout, *cout;
759: PetscInt i, j, n = A->rmap->n, nz;
760: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
761: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
762: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
763: PetscScalar sum1, sum2, sum3, sum4, sum5;
764: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
765: const PetscScalar *b;
767: PetscFunctionBegin;
768: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
769: node_max = a->inode.node_count;
770: ns = a->inode.size; /* Node Size array */
772: PetscCall(VecGetArrayRead(bb, &b));
773: PetscCall(VecGetArrayWrite(xx, &x));
774: tmp = a->solve_work;
776: PetscCall(ISGetIndices(isrow, &rout));
777: r = rout;
778: PetscCall(ISGetIndices(iscol, &cout));
779: c = cout + (n - 1);
781: /* forward solve the lower triangular */
782: tmps = tmp;
783: aa = a_a;
784: aj = a_j;
785: ad = a->diag;
787: for (i = 0, row = 0; i < node_max; ++i) {
788: nsz = ns[i];
789: aii = ai[row];
790: v1 = aa + aii;
791: vi = aj + aii;
792: nz = ad[row] - aii;
793: if (i < node_max - 1) {
794: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
795: * but our indexing to determine its size could. */
796: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
797: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
798: PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
799: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
800: }
802: switch (nsz) { /* Each loop in 'case' is unrolled */
803: case 1:
804: sum1 = b[*r++];
805: for (j = 0; j < nz - 1; j += 2) {
806: i0 = vi[0];
807: i1 = vi[1];
808: vi += 2;
809: tmp0 = tmps[i0];
810: tmp1 = tmps[i1];
811: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
812: v1 += 2;
813: }
814: if (j == nz - 1) {
815: tmp0 = tmps[*vi++];
816: sum1 -= *v1++ * tmp0;
817: }
818: tmp[row++] = sum1;
819: break;
820: case 2:
821: sum1 = b[*r++];
822: sum2 = b[*r++];
823: v2 = aa + ai[row + 1];
825: for (j = 0; j < nz - 1; j += 2) {
826: i0 = vi[0];
827: i1 = vi[1];
828: vi += 2;
829: tmp0 = tmps[i0];
830: tmp1 = tmps[i1];
831: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
832: v1 += 2;
833: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
834: v2 += 2;
835: }
836: if (j == nz - 1) {
837: tmp0 = tmps[*vi++];
838: sum1 -= *v1++ * tmp0;
839: sum2 -= *v2++ * tmp0;
840: }
841: sum2 -= *v2++ * sum1;
842: tmp[row++] = sum1;
843: tmp[row++] = sum2;
844: break;
845: case 3:
846: sum1 = b[*r++];
847: sum2 = b[*r++];
848: sum3 = b[*r++];
849: v2 = aa + ai[row + 1];
850: v3 = aa + ai[row + 2];
852: for (j = 0; j < nz - 1; j += 2) {
853: i0 = vi[0];
854: i1 = vi[1];
855: vi += 2;
856: tmp0 = tmps[i0];
857: tmp1 = tmps[i1];
858: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
859: v1 += 2;
860: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
861: v2 += 2;
862: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
863: v3 += 2;
864: }
865: if (j == nz - 1) {
866: tmp0 = tmps[*vi++];
867: sum1 -= *v1++ * tmp0;
868: sum2 -= *v2++ * tmp0;
869: sum3 -= *v3++ * tmp0;
870: }
871: sum2 -= *v2++ * sum1;
872: sum3 -= *v3++ * sum1;
873: sum3 -= *v3++ * sum2;
875: tmp[row++] = sum1;
876: tmp[row++] = sum2;
877: tmp[row++] = sum3;
878: break;
880: case 4:
881: sum1 = b[*r++];
882: sum2 = b[*r++];
883: sum3 = b[*r++];
884: sum4 = b[*r++];
885: v2 = aa + ai[row + 1];
886: v3 = aa + ai[row + 2];
887: v4 = aa + ai[row + 3];
889: for (j = 0; j < nz - 1; j += 2) {
890: i0 = vi[0];
891: i1 = vi[1];
892: vi += 2;
893: tmp0 = tmps[i0];
894: tmp1 = tmps[i1];
895: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
896: v1 += 2;
897: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
898: v2 += 2;
899: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
900: v3 += 2;
901: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
902: v4 += 2;
903: }
904: if (j == nz - 1) {
905: tmp0 = tmps[*vi++];
906: sum1 -= *v1++ * tmp0;
907: sum2 -= *v2++ * tmp0;
908: sum3 -= *v3++ * tmp0;
909: sum4 -= *v4++ * tmp0;
910: }
911: sum2 -= *v2++ * sum1;
912: sum3 -= *v3++ * sum1;
913: sum4 -= *v4++ * sum1;
914: sum3 -= *v3++ * sum2;
915: sum4 -= *v4++ * sum2;
916: sum4 -= *v4++ * sum3;
918: tmp[row++] = sum1;
919: tmp[row++] = sum2;
920: tmp[row++] = sum3;
921: tmp[row++] = sum4;
922: break;
923: case 5:
924: sum1 = b[*r++];
925: sum2 = b[*r++];
926: sum3 = b[*r++];
927: sum4 = b[*r++];
928: sum5 = b[*r++];
929: v2 = aa + ai[row + 1];
930: v3 = aa + ai[row + 2];
931: v4 = aa + ai[row + 3];
932: v5 = aa + ai[row + 4];
934: for (j = 0; j < nz - 1; j += 2) {
935: i0 = vi[0];
936: i1 = vi[1];
937: vi += 2;
938: tmp0 = tmps[i0];
939: tmp1 = tmps[i1];
940: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
941: v1 += 2;
942: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
943: v2 += 2;
944: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
945: v3 += 2;
946: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
947: v4 += 2;
948: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
949: v5 += 2;
950: }
951: if (j == nz - 1) {
952: tmp0 = tmps[*vi++];
953: sum1 -= *v1++ * tmp0;
954: sum2 -= *v2++ * tmp0;
955: sum3 -= *v3++ * tmp0;
956: sum4 -= *v4++ * tmp0;
957: sum5 -= *v5++ * tmp0;
958: }
960: sum2 -= *v2++ * sum1;
961: sum3 -= *v3++ * sum1;
962: sum4 -= *v4++ * sum1;
963: sum5 -= *v5++ * sum1;
964: sum3 -= *v3++ * sum2;
965: sum4 -= *v4++ * sum2;
966: sum5 -= *v5++ * sum2;
967: sum4 -= *v4++ * sum3;
968: sum5 -= *v5++ * sum3;
969: sum5 -= *v5++ * sum4;
971: tmp[row++] = sum1;
972: tmp[row++] = sum2;
973: tmp[row++] = sum3;
974: tmp[row++] = sum4;
975: tmp[row++] = sum5;
976: break;
977: default:
978: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
979: }
980: }
981: /* backward solve the upper triangular */
982: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
983: nsz = ns[i];
984: aii = ai[row + 1] - 1;
985: v1 = aa + aii;
986: vi = aj + aii;
987: nz = aii - ad[row];
988: switch (nsz) { /* Each loop in 'case' is unrolled */
989: case 1:
990: sum1 = tmp[row];
992: for (j = nz; j > 1; j -= 2) {
993: vi -= 2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000: }
1001: if (j == 1) {
1002: tmp0 = tmps[*vi--];
1003: sum1 -= *v1-- * tmp0;
1004: }
1005: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1006: row--;
1007: break;
1008: case 2:
1009: sum1 = tmp[row];
1010: sum2 = tmp[row - 1];
1011: v2 = aa + ai[row] - 1;
1012: for (j = nz; j > 1; j -= 2) {
1013: vi -= 2;
1014: i0 = vi[2];
1015: i1 = vi[1];
1016: tmp0 = tmps[i0];
1017: tmp1 = tmps[i1];
1018: v1 -= 2;
1019: v2 -= 2;
1020: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022: }
1023: if (j == 1) {
1024: tmp0 = tmps[*vi--];
1025: sum1 -= *v1-- * tmp0;
1026: sum2 -= *v2-- * tmp0;
1027: }
1029: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1030: row--;
1031: sum2 -= *v2-- * tmp0;
1032: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1033: row--;
1034: break;
1035: case 3:
1036: sum1 = tmp[row];
1037: sum2 = tmp[row - 1];
1038: sum3 = tmp[row - 2];
1039: v2 = aa + ai[row] - 1;
1040: v3 = aa + ai[row - 1] - 1;
1041: for (j = nz; j > 1; j -= 2) {
1042: vi -= 2;
1043: i0 = vi[2];
1044: i1 = vi[1];
1045: tmp0 = tmps[i0];
1046: tmp1 = tmps[i1];
1047: v1 -= 2;
1048: v2 -= 2;
1049: v3 -= 2;
1050: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1051: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1052: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1053: }
1054: if (j == 1) {
1055: tmp0 = tmps[*vi--];
1056: sum1 -= *v1-- * tmp0;
1057: sum2 -= *v2-- * tmp0;
1058: sum3 -= *v3-- * tmp0;
1059: }
1060: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1061: row--;
1062: sum2 -= *v2-- * tmp0;
1063: sum3 -= *v3-- * tmp0;
1064: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1065: row--;
1066: sum3 -= *v3-- * tmp0;
1067: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1068: row--;
1070: break;
1071: case 4:
1072: sum1 = tmp[row];
1073: sum2 = tmp[row - 1];
1074: sum3 = tmp[row - 2];
1075: sum4 = tmp[row - 3];
1076: v2 = aa + ai[row] - 1;
1077: v3 = aa + ai[row - 1] - 1;
1078: v4 = aa + ai[row - 2] - 1;
1080: for (j = nz; j > 1; j -= 2) {
1081: vi -= 2;
1082: i0 = vi[2];
1083: i1 = vi[1];
1084: tmp0 = tmps[i0];
1085: tmp1 = tmps[i1];
1086: v1 -= 2;
1087: v2 -= 2;
1088: v3 -= 2;
1089: v4 -= 2;
1090: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1091: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1092: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1093: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1094: }
1095: if (j == 1) {
1096: tmp0 = tmps[*vi--];
1097: sum1 -= *v1-- * tmp0;
1098: sum2 -= *v2-- * tmp0;
1099: sum3 -= *v3-- * tmp0;
1100: sum4 -= *v4-- * tmp0;
1101: }
1103: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1104: row--;
1105: sum2 -= *v2-- * tmp0;
1106: sum3 -= *v3-- * tmp0;
1107: sum4 -= *v4-- * tmp0;
1108: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1109: row--;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1113: row--;
1114: sum4 -= *v4-- * tmp0;
1115: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1116: row--;
1117: break;
1118: case 5:
1119: sum1 = tmp[row];
1120: sum2 = tmp[row - 1];
1121: sum3 = tmp[row - 2];
1122: sum4 = tmp[row - 3];
1123: sum5 = tmp[row - 4];
1124: v2 = aa + ai[row] - 1;
1125: v3 = aa + ai[row - 1] - 1;
1126: v4 = aa + ai[row - 2] - 1;
1127: v5 = aa + ai[row - 3] - 1;
1128: for (j = nz; j > 1; j -= 2) {
1129: vi -= 2;
1130: i0 = vi[2];
1131: i1 = vi[1];
1132: tmp0 = tmps[i0];
1133: tmp1 = tmps[i1];
1134: v1 -= 2;
1135: v2 -= 2;
1136: v3 -= 2;
1137: v4 -= 2;
1138: v5 -= 2;
1139: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1140: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1141: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1142: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1143: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1144: }
1145: if (j == 1) {
1146: tmp0 = tmps[*vi--];
1147: sum1 -= *v1-- * tmp0;
1148: sum2 -= *v2-- * tmp0;
1149: sum3 -= *v3-- * tmp0;
1150: sum4 -= *v4-- * tmp0;
1151: sum5 -= *v5-- * tmp0;
1152: }
1154: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1155: row--;
1156: sum2 -= *v2-- * tmp0;
1157: sum3 -= *v3-- * tmp0;
1158: sum4 -= *v4-- * tmp0;
1159: sum5 -= *v5-- * tmp0;
1160: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1161: row--;
1162: sum3 -= *v3-- * tmp0;
1163: sum4 -= *v4-- * tmp0;
1164: sum5 -= *v5-- * tmp0;
1165: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1166: row--;
1167: sum4 -= *v4-- * tmp0;
1168: sum5 -= *v5-- * tmp0;
1169: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1170: row--;
1171: sum5 -= *v5-- * tmp0;
1172: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1173: row--;
1174: break;
1175: default:
1176: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1177: }
1178: }
1179: PetscCall(ISRestoreIndices(isrow, &rout));
1180: PetscCall(ISRestoreIndices(iscol, &cout));
1181: PetscCall(VecRestoreArrayRead(bb, &b));
1182: PetscCall(VecRestoreArrayWrite(xx, &x));
1183: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1184: PetscFunctionReturn(PETSC_SUCCESS);
1185: }
1187: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1188: {
1189: Mat C = B;
1190: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1191: IS isrow = b->row, isicol = b->icol;
1192: const PetscInt *r, *ic, *ics;
1193: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1194: PetscInt i, j, k, nz, nzL, row, *pj;
1195: const PetscInt *ajtmp, *bjtmp;
1196: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1197: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1198: FactorShiftCtx sctx;
1199: const PetscInt *ddiag;
1200: PetscReal rs;
1201: MatScalar d;
1202: PetscInt inod, nodesz, node_max, col;
1203: const PetscInt *ns;
1204: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1206: PetscFunctionBegin;
1207: /* MatPivotSetUp(): initialize shift context sctx */
1208: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1210: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1211: ddiag = a->diag;
1212: sctx.shift_top = info->zeropivot;
1213: for (i = 0; i < n; i++) {
1214: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1215: d = (aa)[ddiag[i]];
1216: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1217: v = aa + ai[i];
1218: nz = ai[i + 1] - ai[i];
1219: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1220: if (rs > sctx.shift_top) sctx.shift_top = rs;
1221: }
1222: sctx.shift_top *= 1.1;
1223: sctx.nshift_max = 5;
1224: sctx.shift_lo = 0.;
1225: sctx.shift_hi = 1.;
1226: }
1228: PetscCall(ISGetIndices(isrow, &r));
1229: PetscCall(ISGetIndices(isicol, &ic));
1231: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1232: ics = ic;
1234: node_max = a->inode.node_count;
1235: ns = a->inode.size;
1236: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1238: /* If max inode size > 4, split it into two inodes.*/
1239: /* also map the inode sizes according to the ordering */
1240: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1241: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1242: if (ns[i] > 4) {
1243: tmp_vec1[j] = 4;
1244: ++j;
1245: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1246: } else {
1247: tmp_vec1[j] = ns[i];
1248: }
1249: }
1250: /* Use the correct node_max */
1251: node_max = j;
1253: /* Now reorder the inode info based on mat re-ordering info */
1254: /* First create a row -> inode_size_array_index map */
1255: PetscCall(PetscMalloc1(n + 1, &nsmap));
1256: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1257: for (i = 0, row = 0; i < node_max; i++) {
1258: nodesz = tmp_vec1[i];
1259: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1260: }
1261: /* Using nsmap, create a reordered ns structure */
1262: for (i = 0, j = 0; i < node_max; i++) {
1263: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1264: tmp_vec2[i] = nodesz;
1265: j += nodesz;
1266: }
1267: PetscCall(PetscFree(nsmap));
1268: PetscCall(PetscFree(tmp_vec1));
1270: /* Now use the correct ns */
1271: ns = tmp_vec2;
1273: do {
1274: sctx.newshift = PETSC_FALSE;
1275: /* Now loop over each block-row, and do the factorization */
1276: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1277: nodesz = ns[inod];
1279: switch (nodesz) {
1280: case 1:
1281: /* zero rtmp1 */
1282: /* L part */
1283: nz = bi[i + 1] - bi[i];
1284: bjtmp = bj + bi[i];
1285: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1287: /* U part */
1288: nz = bdiag[i] - bdiag[i + 1];
1289: bjtmp = bj + bdiag[i + 1] + 1;
1290: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1292: /* load in initial (unfactored row) */
1293: nz = ai[r[i] + 1] - ai[r[i]];
1294: ajtmp = aj + ai[r[i]];
1295: v = aa + ai[r[i]];
1296: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1298: /* ZeropivotApply() */
1299: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1301: /* elimination */
1302: bjtmp = bj + bi[i];
1303: row = *bjtmp++;
1304: nzL = bi[i + 1] - bi[i];
1305: for (k = 0; k < nzL; k++) {
1306: pc = rtmp1 + row;
1307: if (*pc != 0.0) {
1308: pv = b->a + bdiag[row];
1309: mul1 = *pc * (*pv);
1310: *pc = mul1;
1311: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1312: pv = b->a + bdiag[row + 1] + 1;
1313: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1314: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1315: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1316: }
1317: row = *bjtmp++;
1318: }
1320: /* finished row so stick it into b->a */
1321: rs = 0.0;
1322: /* L part */
1323: pv = b->a + bi[i];
1324: pj = b->j + bi[i];
1325: nz = bi[i + 1] - bi[i];
1326: for (j = 0; j < nz; j++) {
1327: pv[j] = rtmp1[pj[j]];
1328: rs += PetscAbsScalar(pv[j]);
1329: }
1331: /* U part */
1332: pv = b->a + bdiag[i + 1] + 1;
1333: pj = b->j + bdiag[i + 1] + 1;
1334: nz = bdiag[i] - bdiag[i + 1] - 1;
1335: for (j = 0; j < nz; j++) {
1336: pv[j] = rtmp1[pj[j]];
1337: rs += PetscAbsScalar(pv[j]);
1338: }
1340: /* Check zero pivot */
1341: sctx.rs = rs;
1342: sctx.pv = rtmp1[i];
1343: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1344: if (sctx.newshift) break;
1346: /* Mark diagonal and invert diagonal for simpler triangular solves */
1347: pv = b->a + bdiag[i];
1348: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1349: break;
1351: case 2:
1352: /* zero rtmp1 and rtmp2 */
1353: /* L part */
1354: nz = bi[i + 1] - bi[i];
1355: bjtmp = bj + bi[i];
1356: for (j = 0; j < nz; j++) {
1357: col = bjtmp[j];
1358: rtmp1[col] = 0.0;
1359: rtmp2[col] = 0.0;
1360: }
1362: /* U part */
1363: nz = bdiag[i] - bdiag[i + 1];
1364: bjtmp = bj + bdiag[i + 1] + 1;
1365: for (j = 0; j < nz; j++) {
1366: col = bjtmp[j];
1367: rtmp1[col] = 0.0;
1368: rtmp2[col] = 0.0;
1369: }
1371: /* load in initial (unfactored row) */
1372: nz = ai[r[i] + 1] - ai[r[i]];
1373: ajtmp = aj + ai[r[i]];
1374: v1 = aa + ai[r[i]];
1375: v2 = aa + ai[r[i] + 1];
1376: for (j = 0; j < nz; j++) {
1377: col = ics[ajtmp[j]];
1378: rtmp1[col] = v1[j];
1379: rtmp2[col] = v2[j];
1380: }
1381: /* ZeropivotApply(): shift the diagonal of the matrix */
1382: rtmp1[i] += sctx.shift_amount;
1383: rtmp2[i + 1] += sctx.shift_amount;
1385: /* elimination */
1386: bjtmp = bj + bi[i];
1387: row = *bjtmp++; /* pivot row */
1388: nzL = bi[i + 1] - bi[i];
1389: for (k = 0; k < nzL; k++) {
1390: pc1 = rtmp1 + row;
1391: pc2 = rtmp2 + row;
1392: if (*pc1 != 0.0 || *pc2 != 0.0) {
1393: pv = b->a + bdiag[row];
1394: mul1 = *pc1 * (*pv);
1395: mul2 = *pc2 * (*pv);
1396: *pc1 = mul1;
1397: *pc2 = mul2;
1399: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1400: pv = b->a + bdiag[row + 1] + 1;
1401: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1402: for (j = 0; j < nz; j++) {
1403: col = pj[j];
1404: rtmp1[col] -= mul1 * pv[j];
1405: rtmp2[col] -= mul2 * pv[j];
1406: }
1407: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1408: }
1409: row = *bjtmp++;
1410: }
1412: /* finished row i; check zero pivot, then stick row i into b->a */
1413: rs = 0.0;
1414: /* L part */
1415: pc1 = b->a + bi[i];
1416: pj = b->j + bi[i];
1417: nz = bi[i + 1] - bi[i];
1418: for (j = 0; j < nz; j++) {
1419: col = pj[j];
1420: pc1[j] = rtmp1[col];
1421: rs += PetscAbsScalar(pc1[j]);
1422: }
1423: /* U part */
1424: pc1 = b->a + bdiag[i + 1] + 1;
1425: pj = b->j + bdiag[i + 1] + 1;
1426: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1427: for (j = 0; j < nz; j++) {
1428: col = pj[j];
1429: pc1[j] = rtmp1[col];
1430: rs += PetscAbsScalar(pc1[j]);
1431: }
1433: sctx.rs = rs;
1434: sctx.pv = rtmp1[i];
1435: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1436: if (sctx.newshift) break;
1437: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1438: *pc1 = 1.0 / sctx.pv;
1440: /* Now take care of diagonal 2x2 block. */
1441: pc2 = rtmp2 + i;
1442: if (*pc2 != 0.0) {
1443: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1444: *pc2 = mul1; /* insert L entry */
1445: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1446: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1447: for (j = 0; j < nz; j++) {
1448: col = pj[j];
1449: rtmp2[col] -= mul1 * rtmp1[col];
1450: }
1451: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1452: }
1454: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1455: rs = 0.0;
1456: /* L part */
1457: pc2 = b->a + bi[i + 1];
1458: pj = b->j + bi[i + 1];
1459: nz = bi[i + 2] - bi[i + 1];
1460: for (j = 0; j < nz; j++) {
1461: col = pj[j];
1462: pc2[j] = rtmp2[col];
1463: rs += PetscAbsScalar(pc2[j]);
1464: }
1465: /* U part */
1466: pc2 = b->a + bdiag[i + 2] + 1;
1467: pj = b->j + bdiag[i + 2] + 1;
1468: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1469: for (j = 0; j < nz; j++) {
1470: col = pj[j];
1471: pc2[j] = rtmp2[col];
1472: rs += PetscAbsScalar(pc2[j]);
1473: }
1475: sctx.rs = rs;
1476: sctx.pv = rtmp2[i + 1];
1477: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1478: if (sctx.newshift) break;
1479: pc2 = b->a + bdiag[i + 1];
1480: *pc2 = 1.0 / sctx.pv;
1481: break;
1483: case 3:
1484: /* zero rtmp */
1485: /* L part */
1486: nz = bi[i + 1] - bi[i];
1487: bjtmp = bj + bi[i];
1488: for (j = 0; j < nz; j++) {
1489: col = bjtmp[j];
1490: rtmp1[col] = 0.0;
1491: rtmp2[col] = 0.0;
1492: rtmp3[col] = 0.0;
1493: }
1495: /* U part */
1496: nz = bdiag[i] - bdiag[i + 1];
1497: bjtmp = bj + bdiag[i + 1] + 1;
1498: for (j = 0; j < nz; j++) {
1499: col = bjtmp[j];
1500: rtmp1[col] = 0.0;
1501: rtmp2[col] = 0.0;
1502: rtmp3[col] = 0.0;
1503: }
1505: /* load in initial (unfactored row) */
1506: nz = ai[r[i] + 1] - ai[r[i]];
1507: ajtmp = aj + ai[r[i]];
1508: v1 = aa + ai[r[i]];
1509: v2 = aa + ai[r[i] + 1];
1510: v3 = aa + ai[r[i] + 2];
1511: for (j = 0; j < nz; j++) {
1512: col = ics[ajtmp[j]];
1513: rtmp1[col] = v1[j];
1514: rtmp2[col] = v2[j];
1515: rtmp3[col] = v3[j];
1516: }
1517: /* ZeropivotApply(): shift the diagonal of the matrix */
1518: rtmp1[i] += sctx.shift_amount;
1519: rtmp2[i + 1] += sctx.shift_amount;
1520: rtmp3[i + 2] += sctx.shift_amount;
1522: /* elimination */
1523: bjtmp = bj + bi[i];
1524: row = *bjtmp++; /* pivot row */
1525: nzL = bi[i + 1] - bi[i];
1526: for (k = 0; k < nzL; k++) {
1527: pc1 = rtmp1 + row;
1528: pc2 = rtmp2 + row;
1529: pc3 = rtmp3 + row;
1530: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1531: pv = b->a + bdiag[row];
1532: mul1 = *pc1 * (*pv);
1533: mul2 = *pc2 * (*pv);
1534: mul3 = *pc3 * (*pv);
1535: *pc1 = mul1;
1536: *pc2 = mul2;
1537: *pc3 = mul3;
1539: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1540: pv = b->a + bdiag[row + 1] + 1;
1541: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1542: for (j = 0; j < nz; j++) {
1543: col = pj[j];
1544: rtmp1[col] -= mul1 * pv[j];
1545: rtmp2[col] -= mul2 * pv[j];
1546: rtmp3[col] -= mul3 * pv[j];
1547: }
1548: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1549: }
1550: row = *bjtmp++;
1551: }
1553: /* finished row i; check zero pivot, then stick row i into b->a */
1554: rs = 0.0;
1555: /* L part */
1556: pc1 = b->a + bi[i];
1557: pj = b->j + bi[i];
1558: nz = bi[i + 1] - bi[i];
1559: for (j = 0; j < nz; j++) {
1560: col = pj[j];
1561: pc1[j] = rtmp1[col];
1562: rs += PetscAbsScalar(pc1[j]);
1563: }
1564: /* U part */
1565: pc1 = b->a + bdiag[i + 1] + 1;
1566: pj = b->j + bdiag[i + 1] + 1;
1567: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1568: for (j = 0; j < nz; j++) {
1569: col = pj[j];
1570: pc1[j] = rtmp1[col];
1571: rs += PetscAbsScalar(pc1[j]);
1572: }
1574: sctx.rs = rs;
1575: sctx.pv = rtmp1[i];
1576: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1577: if (sctx.newshift) break;
1578: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1579: *pc1 = 1.0 / sctx.pv;
1581: /* Now take care of 1st column of diagonal 3x3 block. */
1582: pc2 = rtmp2 + i;
1583: pc3 = rtmp3 + i;
1584: if (*pc2 != 0.0 || *pc3 != 0.0) {
1585: mul2 = (*pc2) * (*pc1);
1586: *pc2 = mul2;
1587: mul3 = (*pc3) * (*pc1);
1588: *pc3 = mul3;
1589: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1590: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1591: for (j = 0; j < nz; j++) {
1592: col = pj[j];
1593: rtmp2[col] -= mul2 * rtmp1[col];
1594: rtmp3[col] -= mul3 * rtmp1[col];
1595: }
1596: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1597: }
1599: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1600: rs = 0.0;
1601: /* L part */
1602: pc2 = b->a + bi[i + 1];
1603: pj = b->j + bi[i + 1];
1604: nz = bi[i + 2] - bi[i + 1];
1605: for (j = 0; j < nz; j++) {
1606: col = pj[j];
1607: pc2[j] = rtmp2[col];
1608: rs += PetscAbsScalar(pc2[j]);
1609: }
1610: /* U part */
1611: pc2 = b->a + bdiag[i + 2] + 1;
1612: pj = b->j + bdiag[i + 2] + 1;
1613: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1614: for (j = 0; j < nz; j++) {
1615: col = pj[j];
1616: pc2[j] = rtmp2[col];
1617: rs += PetscAbsScalar(pc2[j]);
1618: }
1620: sctx.rs = rs;
1621: sctx.pv = rtmp2[i + 1];
1622: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1623: if (sctx.newshift) break;
1624: pc2 = b->a + bdiag[i + 1];
1625: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1627: /* Now take care of 2nd column of diagonal 3x3 block. */
1628: pc3 = rtmp3 + i + 1;
1629: if (*pc3 != 0.0) {
1630: mul3 = (*pc3) * (*pc2);
1631: *pc3 = mul3;
1632: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1633: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1634: for (j = 0; j < nz; j++) {
1635: col = pj[j];
1636: rtmp3[col] -= mul3 * rtmp2[col];
1637: }
1638: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1639: }
1641: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1642: rs = 0.0;
1643: /* L part */
1644: pc3 = b->a + bi[i + 2];
1645: pj = b->j + bi[i + 2];
1646: nz = bi[i + 3] - bi[i + 2];
1647: for (j = 0; j < nz; j++) {
1648: col = pj[j];
1649: pc3[j] = rtmp3[col];
1650: rs += PetscAbsScalar(pc3[j]);
1651: }
1652: /* U part */
1653: pc3 = b->a + bdiag[i + 3] + 1;
1654: pj = b->j + bdiag[i + 3] + 1;
1655: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1656: for (j = 0; j < nz; j++) {
1657: col = pj[j];
1658: pc3[j] = rtmp3[col];
1659: rs += PetscAbsScalar(pc3[j]);
1660: }
1662: sctx.rs = rs;
1663: sctx.pv = rtmp3[i + 2];
1664: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1665: if (sctx.newshift) break;
1666: pc3 = b->a + bdiag[i + 2];
1667: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1668: break;
1669: case 4:
1670: /* zero rtmp */
1671: /* L part */
1672: nz = bi[i + 1] - bi[i];
1673: bjtmp = bj + bi[i];
1674: for (j = 0; j < nz; j++) {
1675: col = bjtmp[j];
1676: rtmp1[col] = 0.0;
1677: rtmp2[col] = 0.0;
1678: rtmp3[col] = 0.0;
1679: rtmp4[col] = 0.0;
1680: }
1682: /* U part */
1683: nz = bdiag[i] - bdiag[i + 1];
1684: bjtmp = bj + bdiag[i + 1] + 1;
1685: for (j = 0; j < nz; j++) {
1686: col = bjtmp[j];
1687: rtmp1[col] = 0.0;
1688: rtmp2[col] = 0.0;
1689: rtmp3[col] = 0.0;
1690: rtmp4[col] = 0.0;
1691: }
1693: /* load in initial (unfactored row) */
1694: nz = ai[r[i] + 1] - ai[r[i]];
1695: ajtmp = aj + ai[r[i]];
1696: v1 = aa + ai[r[i]];
1697: v2 = aa + ai[r[i] + 1];
1698: v3 = aa + ai[r[i] + 2];
1699: v4 = aa + ai[r[i] + 3];
1700: for (j = 0; j < nz; j++) {
1701: col = ics[ajtmp[j]];
1702: rtmp1[col] = v1[j];
1703: rtmp2[col] = v2[j];
1704: rtmp3[col] = v3[j];
1705: rtmp4[col] = v4[j];
1706: }
1707: /* ZeropivotApply(): shift the diagonal of the matrix */
1708: rtmp1[i] += sctx.shift_amount;
1709: rtmp2[i + 1] += sctx.shift_amount;
1710: rtmp3[i + 2] += sctx.shift_amount;
1711: rtmp4[i + 3] += sctx.shift_amount;
1713: /* elimination */
1714: bjtmp = bj + bi[i];
1715: row = *bjtmp++; /* pivot row */
1716: nzL = bi[i + 1] - bi[i];
1717: for (k = 0; k < nzL; k++) {
1718: pc1 = rtmp1 + row;
1719: pc2 = rtmp2 + row;
1720: pc3 = rtmp3 + row;
1721: pc4 = rtmp4 + row;
1722: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723: pv = b->a + bdiag[row];
1724: mul1 = *pc1 * (*pv);
1725: mul2 = *pc2 * (*pv);
1726: mul3 = *pc3 * (*pv);
1727: mul4 = *pc4 * (*pv);
1728: *pc1 = mul1;
1729: *pc2 = mul2;
1730: *pc3 = mul3;
1731: *pc4 = mul4;
1733: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1734: pv = b->a + bdiag[row + 1] + 1;
1735: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1736: for (j = 0; j < nz; j++) {
1737: col = pj[j];
1738: rtmp1[col] -= mul1 * pv[j];
1739: rtmp2[col] -= mul2 * pv[j];
1740: rtmp3[col] -= mul3 * pv[j];
1741: rtmp4[col] -= mul4 * pv[j];
1742: }
1743: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1744: }
1745: row = *bjtmp++;
1746: }
1748: /* finished row i; check zero pivot, then stick row i into b->a */
1749: rs = 0.0;
1750: /* L part */
1751: pc1 = b->a + bi[i];
1752: pj = b->j + bi[i];
1753: nz = bi[i + 1] - bi[i];
1754: for (j = 0; j < nz; j++) {
1755: col = pj[j];
1756: pc1[j] = rtmp1[col];
1757: rs += PetscAbsScalar(pc1[j]);
1758: }
1759: /* U part */
1760: pc1 = b->a + bdiag[i + 1] + 1;
1761: pj = b->j + bdiag[i + 1] + 1;
1762: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1763: for (j = 0; j < nz; j++) {
1764: col = pj[j];
1765: pc1[j] = rtmp1[col];
1766: rs += PetscAbsScalar(pc1[j]);
1767: }
1769: sctx.rs = rs;
1770: sctx.pv = rtmp1[i];
1771: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1772: if (sctx.newshift) break;
1773: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1774: *pc1 = 1.0 / sctx.pv;
1776: /* Now take care of 1st column of diagonal 4x4 block. */
1777: pc2 = rtmp2 + i;
1778: pc3 = rtmp3 + i;
1779: pc4 = rtmp4 + i;
1780: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1781: mul2 = (*pc2) * (*pc1);
1782: *pc2 = mul2;
1783: mul3 = (*pc3) * (*pc1);
1784: *pc3 = mul3;
1785: mul4 = (*pc4) * (*pc1);
1786: *pc4 = mul4;
1787: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1788: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1789: for (j = 0; j < nz; j++) {
1790: col = pj[j];
1791: rtmp2[col] -= mul2 * rtmp1[col];
1792: rtmp3[col] -= mul3 * rtmp1[col];
1793: rtmp4[col] -= mul4 * rtmp1[col];
1794: }
1795: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1796: }
1798: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1799: rs = 0.0;
1800: /* L part */
1801: pc2 = b->a + bi[i + 1];
1802: pj = b->j + bi[i + 1];
1803: nz = bi[i + 2] - bi[i + 1];
1804: for (j = 0; j < nz; j++) {
1805: col = pj[j];
1806: pc2[j] = rtmp2[col];
1807: rs += PetscAbsScalar(pc2[j]);
1808: }
1809: /* U part */
1810: pc2 = b->a + bdiag[i + 2] + 1;
1811: pj = b->j + bdiag[i + 2] + 1;
1812: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1813: for (j = 0; j < nz; j++) {
1814: col = pj[j];
1815: pc2[j] = rtmp2[col];
1816: rs += PetscAbsScalar(pc2[j]);
1817: }
1819: sctx.rs = rs;
1820: sctx.pv = rtmp2[i + 1];
1821: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1822: if (sctx.newshift) break;
1823: pc2 = b->a + bdiag[i + 1];
1824: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1826: /* Now take care of 2nd column of diagonal 4x4 block. */
1827: pc3 = rtmp3 + i + 1;
1828: pc4 = rtmp4 + i + 1;
1829: if (*pc3 != 0.0 || *pc4 != 0.0) {
1830: mul3 = (*pc3) * (*pc2);
1831: *pc3 = mul3;
1832: mul4 = (*pc4) * (*pc2);
1833: *pc4 = mul4;
1834: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1835: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1836: for (j = 0; j < nz; j++) {
1837: col = pj[j];
1838: rtmp3[col] -= mul3 * rtmp2[col];
1839: rtmp4[col] -= mul4 * rtmp2[col];
1840: }
1841: PetscCall(PetscLogFlops(4.0 * nz));
1842: }
1844: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1845: rs = 0.0;
1846: /* L part */
1847: pc3 = b->a + bi[i + 2];
1848: pj = b->j + bi[i + 2];
1849: nz = bi[i + 3] - bi[i + 2];
1850: for (j = 0; j < nz; j++) {
1851: col = pj[j];
1852: pc3[j] = rtmp3[col];
1853: rs += PetscAbsScalar(pc3[j]);
1854: }
1855: /* U part */
1856: pc3 = b->a + bdiag[i + 3] + 1;
1857: pj = b->j + bdiag[i + 3] + 1;
1858: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1859: for (j = 0; j < nz; j++) {
1860: col = pj[j];
1861: pc3[j] = rtmp3[col];
1862: rs += PetscAbsScalar(pc3[j]);
1863: }
1865: sctx.rs = rs;
1866: sctx.pv = rtmp3[i + 2];
1867: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1868: if (sctx.newshift) break;
1869: pc3 = b->a + bdiag[i + 2];
1870: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1872: /* Now take care of 3rd column of diagonal 4x4 block. */
1873: pc4 = rtmp4 + i + 2;
1874: if (*pc4 != 0.0) {
1875: mul4 = (*pc4) * (*pc3);
1876: *pc4 = mul4;
1877: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1878: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1879: for (j = 0; j < nz; j++) {
1880: col = pj[j];
1881: rtmp4[col] -= mul4 * rtmp3[col];
1882: }
1883: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1884: }
1886: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1887: rs = 0.0;
1888: /* L part */
1889: pc4 = b->a + bi[i + 3];
1890: pj = b->j + bi[i + 3];
1891: nz = bi[i + 4] - bi[i + 3];
1892: for (j = 0; j < nz; j++) {
1893: col = pj[j];
1894: pc4[j] = rtmp4[col];
1895: rs += PetscAbsScalar(pc4[j]);
1896: }
1897: /* U part */
1898: pc4 = b->a + bdiag[i + 4] + 1;
1899: pj = b->j + bdiag[i + 4] + 1;
1900: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1901: for (j = 0; j < nz; j++) {
1902: col = pj[j];
1903: pc4[j] = rtmp4[col];
1904: rs += PetscAbsScalar(pc4[j]);
1905: }
1907: sctx.rs = rs;
1908: sctx.pv = rtmp4[i + 3];
1909: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1910: if (sctx.newshift) break;
1911: pc4 = b->a + bdiag[i + 3];
1912: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1913: break;
1915: default:
1916: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1917: }
1918: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1919: i += nodesz; /* Update the row */
1920: }
1922: /* MatPivotRefine() */
1923: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1924: /*
1925: * if no shift in this attempt & shifting & started shifting & can refine,
1926: * then try lower shift
1927: */
1928: sctx.shift_hi = sctx.shift_fraction;
1929: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1930: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1931: sctx.newshift = PETSC_TRUE;
1932: sctx.nshift++;
1933: }
1934: } while (sctx.newshift);
1936: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1937: PetscCall(PetscFree(tmp_vec2));
1938: PetscCall(ISRestoreIndices(isicol, &ic));
1939: PetscCall(ISRestoreIndices(isrow, &r));
1941: if (b->inode.size) {
1942: C->ops->solve = MatSolve_SeqAIJ_Inode;
1943: } else {
1944: C->ops->solve = MatSolve_SeqAIJ;
1945: }
1946: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1947: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1948: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1949: C->ops->matsolve = MatMatSolve_SeqAIJ;
1950: C->assembled = PETSC_TRUE;
1951: C->preallocated = PETSC_TRUE;
1953: PetscCall(PetscLogFlops(C->cmap->n));
1955: /* MatShiftView(A,info,&sctx) */
1956: if (sctx.nshift) {
1957: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1958: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1959: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1960: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1961: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1962: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1963: }
1964: }
1965: PetscFunctionReturn(PETSC_SUCCESS);
1966: }
1968: #if 0
1969: // unused
1970: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1971: {
1972: Mat C = B;
1973: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1974: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1975: const PetscInt *r, *ic, *c, *ics;
1976: PetscInt n = A->rmap->n, *bi = b->i;
1977: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1978: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1979: PetscInt *ai = a->i, *aj = a->j;
1980: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1981: PetscScalar mul1, mul2, mul3, tmp;
1982: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1983: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1984: PetscReal rs = 0.0;
1985: FactorShiftCtx sctx;
1987: PetscFunctionBegin;
1988: sctx.shift_top = 0;
1989: sctx.nshift_max = 0;
1990: sctx.shift_lo = 0;
1991: sctx.shift_hi = 0;
1992: sctx.shift_fraction = 0;
1994: /* if both shift schemes are chosen by user, only use info->shiftpd */
1995: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1996: sctx.shift_top = 0;
1997: for (i = 0; i < n; i++) {
1998: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1999: rs = 0.0;
2000: ajtmp = aj + ai[i];
2001: rtmp1 = aa + ai[i];
2002: nz = ai[i + 1] - ai[i];
2003: for (j = 0; j < nz; j++) {
2004: if (*ajtmp != i) {
2005: rs += PetscAbsScalar(*rtmp1++);
2006: } else {
2007: rs -= PetscRealPart(*rtmp1++);
2008: }
2009: ajtmp++;
2010: }
2011: if (rs > sctx.shift_top) sctx.shift_top = rs;
2012: }
2013: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2014: sctx.shift_top *= 1.1;
2015: sctx.nshift_max = 5;
2016: sctx.shift_lo = 0.;
2017: sctx.shift_hi = 1.;
2018: }
2019: sctx.shift_amount = 0;
2020: sctx.nshift = 0;
2022: PetscCall(ISGetIndices(isrow, &r));
2023: PetscCall(ISGetIndices(iscol, &c));
2024: PetscCall(ISGetIndices(isicol, &ic));
2025: PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2026: ics = ic;
2028: node_max = a->inode.node_count;
2029: ns = a->inode.size;
2030: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2032: /* If max inode size > 3, split it into two inodes.*/
2033: /* also map the inode sizes according to the ordering */
2034: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2035: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2036: if (ns[i] > 3) {
2037: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2038: ++j;
2039: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2040: } else {
2041: tmp_vec1[j] = ns[i];
2042: }
2043: }
2044: /* Use the correct node_max */
2045: node_max = j;
2047: /* Now reorder the inode info based on mat re-ordering info */
2048: /* First create a row -> inode_size_array_index map */
2049: PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2050: for (i = 0, row = 0; i < node_max; i++) {
2051: nodesz = tmp_vec1[i];
2052: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2053: }
2054: /* Using nsmap, create a reordered ns structure */
2055: for (i = 0, j = 0; i < node_max; i++) {
2056: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2057: tmp_vec2[i] = nodesz;
2058: j += nodesz;
2059: }
2060: PetscCall(PetscFree2(nsmap, tmp_vec1));
2061: /* Now use the correct ns */
2062: ns = tmp_vec2;
2064: do {
2065: sctx.newshift = PETSC_FALSE;
2066: /* Now loop over each block-row, and do the factorization */
2067: for (i = 0, row = 0; i < node_max; i++) {
2068: nodesz = ns[i];
2069: nz = bi[row + 1] - bi[row];
2070: bjtmp = bj + bi[row];
2072: switch (nodesz) {
2073: case 1:
2074: for (j = 0; j < nz; j++) {
2075: idx = bjtmp[j];
2076: rtmp11[idx] = 0.0;
2077: }
2079: /* load in initial (unfactored row) */
2080: idx = r[row];
2081: nz_tmp = ai[idx + 1] - ai[idx];
2082: ajtmp = aj + ai[idx];
2083: v1 = aa + ai[idx];
2085: for (j = 0; j < nz_tmp; j++) {
2086: idx = ics[ajtmp[j]];
2087: rtmp11[idx] = v1[j];
2088: }
2089: rtmp11[ics[r[row]]] += sctx.shift_amount;
2091: prow = *bjtmp++;
2092: while (prow < row) {
2093: pc1 = rtmp11 + prow;
2094: if (*pc1 != 0.0) {
2095: pv = ba + bd[prow];
2096: pj = nbj + bd[prow];
2097: mul1 = *pc1 * *pv++;
2098: *pc1 = mul1;
2099: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2100: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2101: for (j = 0; j < nz_tmp; j++) {
2102: tmp = pv[j];
2103: idx = pj[j];
2104: rtmp11[idx] -= mul1 * tmp;
2105: }
2106: }
2107: prow = *bjtmp++;
2108: }
2109: pj = bj + bi[row];
2110: pc1 = ba + bi[row];
2112: sctx.pv = rtmp11[row];
2113: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2114: rs = 0.0;
2115: for (j = 0; j < nz; j++) {
2116: idx = pj[j];
2117: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2118: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2119: }
2120: sctx.rs = rs;
2121: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2122: if (sctx.newshift) goto endofwhile;
2123: break;
2125: case 2:
2126: for (j = 0; j < nz; j++) {
2127: idx = bjtmp[j];
2128: rtmp11[idx] = 0.0;
2129: rtmp22[idx] = 0.0;
2130: }
2132: /* load in initial (unfactored row) */
2133: idx = r[row];
2134: nz_tmp = ai[idx + 1] - ai[idx];
2135: ajtmp = aj + ai[idx];
2136: v1 = aa + ai[idx];
2137: v2 = aa + ai[idx + 1];
2138: for (j = 0; j < nz_tmp; j++) {
2139: idx = ics[ajtmp[j]];
2140: rtmp11[idx] = v1[j];
2141: rtmp22[idx] = v2[j];
2142: }
2143: rtmp11[ics[r[row]]] += sctx.shift_amount;
2144: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2146: prow = *bjtmp++;
2147: while (prow < row) {
2148: pc1 = rtmp11 + prow;
2149: pc2 = rtmp22 + prow;
2150: if (*pc1 != 0.0 || *pc2 != 0.0) {
2151: pv = ba + bd[prow];
2152: pj = nbj + bd[prow];
2153: mul1 = *pc1 * *pv;
2154: mul2 = *pc2 * *pv;
2155: ++pv;
2156: *pc1 = mul1;
2157: *pc2 = mul2;
2159: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2160: for (j = 0; j < nz_tmp; j++) {
2161: tmp = pv[j];
2162: idx = pj[j];
2163: rtmp11[idx] -= mul1 * tmp;
2164: rtmp22[idx] -= mul2 * tmp;
2165: }
2166: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2167: }
2168: prow = *bjtmp++;
2169: }
2171: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2172: pc1 = rtmp11 + prow;
2173: pc2 = rtmp22 + prow;
2175: sctx.pv = *pc1;
2176: pj = bj + bi[prow];
2177: rs = 0.0;
2178: for (j = 0; j < nz; j++) {
2179: idx = pj[j];
2180: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2181: }
2182: sctx.rs = rs;
2183: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2184: if (sctx.newshift) goto endofwhile;
2186: if (*pc2 != 0.0) {
2187: pj = nbj + bd[prow];
2188: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2189: *pc2 = mul2;
2190: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2191: for (j = 0; j < nz_tmp; j++) {
2192: idx = pj[j];
2193: tmp = rtmp11[idx];
2194: rtmp22[idx] -= mul2 * tmp;
2195: }
2196: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2197: }
2199: pj = bj + bi[row];
2200: pc1 = ba + bi[row];
2201: pc2 = ba + bi[row + 1];
2203: sctx.pv = rtmp22[row + 1];
2204: rs = 0.0;
2205: rtmp11[row] = 1.0 / rtmp11[row];
2206: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2207: /* copy row entries from dense representation to sparse */
2208: for (j = 0; j < nz; j++) {
2209: idx = pj[j];
2210: pc1[j] = rtmp11[idx];
2211: pc2[j] = rtmp22[idx];
2212: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2213: }
2214: sctx.rs = rs;
2215: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2216: if (sctx.newshift) goto endofwhile;
2217: break;
2219: case 3:
2220: for (j = 0; j < nz; j++) {
2221: idx = bjtmp[j];
2222: rtmp11[idx] = 0.0;
2223: rtmp22[idx] = 0.0;
2224: rtmp33[idx] = 0.0;
2225: }
2226: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2227: idx = r[row];
2228: nz_tmp = ai[idx + 1] - ai[idx];
2229: ajtmp = aj + ai[idx];
2230: v1 = aa + ai[idx];
2231: v2 = aa + ai[idx + 1];
2232: v3 = aa + ai[idx + 2];
2233: for (j = 0; j < nz_tmp; j++) {
2234: idx = ics[ajtmp[j]];
2235: rtmp11[idx] = v1[j];
2236: rtmp22[idx] = v2[j];
2237: rtmp33[idx] = v3[j];
2238: }
2239: rtmp11[ics[r[row]]] += sctx.shift_amount;
2240: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2241: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2243: /* loop over all pivot row blocks above this row block */
2244: prow = *bjtmp++;
2245: while (prow < row) {
2246: pc1 = rtmp11 + prow;
2247: pc2 = rtmp22 + prow;
2248: pc3 = rtmp33 + prow;
2249: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2250: pv = ba + bd[prow];
2251: pj = nbj + bd[prow];
2252: mul1 = *pc1 * *pv;
2253: mul2 = *pc2 * *pv;
2254: mul3 = *pc3 * *pv;
2255: ++pv;
2256: *pc1 = mul1;
2257: *pc2 = mul2;
2258: *pc3 = mul3;
2260: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2261: /* update this row based on pivot row */
2262: for (j = 0; j < nz_tmp; j++) {
2263: tmp = pv[j];
2264: idx = pj[j];
2265: rtmp11[idx] -= mul1 * tmp;
2266: rtmp22[idx] -= mul2 * tmp;
2267: rtmp33[idx] -= mul3 * tmp;
2268: }
2269: PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2270: }
2271: prow = *bjtmp++;
2272: }
2274: /* Now take care of diagonal 3x3 block in this set of rows */
2275: /* note: prow = row here */
2276: pc1 = rtmp11 + prow;
2277: pc2 = rtmp22 + prow;
2278: pc3 = rtmp33 + prow;
2280: sctx.pv = *pc1;
2281: pj = bj + bi[prow];
2282: rs = 0.0;
2283: for (j = 0; j < nz; j++) {
2284: idx = pj[j];
2285: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2286: }
2287: sctx.rs = rs;
2288: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2289: if (sctx.newshift) goto endofwhile;
2291: if (*pc2 != 0.0 || *pc3 != 0.0) {
2292: mul2 = (*pc2) / (*pc1);
2293: mul3 = (*pc3) / (*pc1);
2294: *pc2 = mul2;
2295: *pc3 = mul3;
2296: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2297: pj = nbj + bd[prow];
2298: for (j = 0; j < nz_tmp; j++) {
2299: idx = pj[j];
2300: tmp = rtmp11[idx];
2301: rtmp22[idx] -= mul2 * tmp;
2302: rtmp33[idx] -= mul3 * tmp;
2303: }
2304: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2305: }
2306: ++prow;
2308: pc2 = rtmp22 + prow;
2309: pc3 = rtmp33 + prow;
2310: sctx.pv = *pc2;
2311: pj = bj + bi[prow];
2312: rs = 0.0;
2313: for (j = 0; j < nz; j++) {
2314: idx = pj[j];
2315: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2316: }
2317: sctx.rs = rs;
2318: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2319: if (sctx.newshift) goto endofwhile;
2321: if (*pc3 != 0.0) {
2322: mul3 = (*pc3) / (*pc2);
2323: *pc3 = mul3;
2324: pj = nbj + bd[prow];
2325: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2326: for (j = 0; j < nz_tmp; j++) {
2327: idx = pj[j];
2328: tmp = rtmp22[idx];
2329: rtmp33[idx] -= mul3 * tmp;
2330: }
2331: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2332: }
2334: pj = bj + bi[row];
2335: pc1 = ba + bi[row];
2336: pc2 = ba + bi[row + 1];
2337: pc3 = ba + bi[row + 2];
2339: sctx.pv = rtmp33[row + 2];
2340: rs = 0.0;
2341: rtmp11[row] = 1.0 / rtmp11[row];
2342: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2343: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2344: /* copy row entries from dense representation to sparse */
2345: for (j = 0; j < nz; j++) {
2346: idx = pj[j];
2347: pc1[j] = rtmp11[idx];
2348: pc2[j] = rtmp22[idx];
2349: pc3[j] = rtmp33[idx];
2350: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2351: }
2353: sctx.rs = rs;
2354: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2355: if (sctx.newshift) goto endofwhile;
2356: break;
2358: default:
2359: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2360: }
2361: row += nodesz; /* Update the row */
2362: }
2363: endofwhile:;
2364: } while (sctx.newshift);
2365: PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2366: PetscCall(PetscFree(tmp_vec2));
2367: PetscCall(ISRestoreIndices(isicol, &ic));
2368: PetscCall(ISRestoreIndices(isrow, &r));
2369: PetscCall(ISRestoreIndices(iscol, &c));
2371: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2372: /* do not set solve add, since MatSolve_Inode + Add is faster */
2373: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2374: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2375: C->assembled = PETSC_TRUE;
2376: C->preallocated = PETSC_TRUE;
2377: if (sctx.nshift) {
2378: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2379: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2380: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2381: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2382: }
2383: }
2384: PetscCall(PetscLogFlops(C->cmap->n));
2385: PetscCall(MatSeqAIJCheckInode(C));
2386: PetscFunctionReturn(PETSC_SUCCESS);
2387: }
2388: #endif
2390: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2391: {
2392: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2393: IS iscol = a->col, isrow = a->row;
2394: const PetscInt *r, *c, *rout, *cout;
2395: PetscInt i, j, n = A->rmap->n;
2396: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2397: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2398: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2399: PetscScalar sum1, sum2, sum3, sum4, sum5;
2400: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2401: const PetscScalar *b;
2403: PetscFunctionBegin;
2404: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2405: node_max = a->inode.node_count;
2406: ns = a->inode.size; /* Node Size array */
2408: PetscCall(VecGetArrayRead(bb, &b));
2409: PetscCall(VecGetArrayWrite(xx, &x));
2410: tmp = a->solve_work;
2412: PetscCall(ISGetIndices(isrow, &rout));
2413: r = rout;
2414: PetscCall(ISGetIndices(iscol, &cout));
2415: c = cout;
2417: /* forward solve the lower triangular */
2418: tmps = tmp;
2419: aa = a_a;
2420: aj = a_j;
2421: ad = a->diag;
2423: for (i = 0, row = 0; i < node_max; ++i) {
2424: nsz = ns[i];
2425: aii = ai[row];
2426: v1 = aa + aii;
2427: vi = aj + aii;
2428: nz = ai[row + 1] - ai[row];
2430: if (i < node_max - 1) {
2431: /* Prefetch the indices for the next block */
2432: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2433: /* Prefetch the data for the next block */
2434: PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2435: }
2437: switch (nsz) { /* Each loop in 'case' is unrolled */
2438: case 1:
2439: sum1 = b[r[row]];
2440: for (j = 0; j < nz - 1; j += 2) {
2441: i0 = vi[j];
2442: i1 = vi[j + 1];
2443: tmp0 = tmps[i0];
2444: tmp1 = tmps[i1];
2445: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2446: }
2447: if (j == nz - 1) {
2448: tmp0 = tmps[vi[j]];
2449: sum1 -= v1[j] * tmp0;
2450: }
2451: tmp[row++] = sum1;
2452: break;
2453: case 2:
2454: sum1 = b[r[row]];
2455: sum2 = b[r[row + 1]];
2456: v2 = aa + ai[row + 1];
2458: for (j = 0; j < nz - 1; j += 2) {
2459: i0 = vi[j];
2460: i1 = vi[j + 1];
2461: tmp0 = tmps[i0];
2462: tmp1 = tmps[i1];
2463: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2464: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2465: }
2466: if (j == nz - 1) {
2467: tmp0 = tmps[vi[j]];
2468: sum1 -= v1[j] * tmp0;
2469: sum2 -= v2[j] * tmp0;
2470: }
2471: sum2 -= v2[nz] * sum1;
2472: tmp[row++] = sum1;
2473: tmp[row++] = sum2;
2474: break;
2475: case 3:
2476: sum1 = b[r[row]];
2477: sum2 = b[r[row + 1]];
2478: sum3 = b[r[row + 2]];
2479: v2 = aa + ai[row + 1];
2480: v3 = aa + ai[row + 2];
2482: for (j = 0; j < nz - 1; j += 2) {
2483: i0 = vi[j];
2484: i1 = vi[j + 1];
2485: tmp0 = tmps[i0];
2486: tmp1 = tmps[i1];
2487: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2488: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2489: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2490: }
2491: if (j == nz - 1) {
2492: tmp0 = tmps[vi[j]];
2493: sum1 -= v1[j] * tmp0;
2494: sum2 -= v2[j] * tmp0;
2495: sum3 -= v3[j] * tmp0;
2496: }
2497: sum2 -= v2[nz] * sum1;
2498: sum3 -= v3[nz] * sum1;
2499: sum3 -= v3[nz + 1] * sum2;
2500: tmp[row++] = sum1;
2501: tmp[row++] = sum2;
2502: tmp[row++] = sum3;
2503: break;
2505: case 4:
2506: sum1 = b[r[row]];
2507: sum2 = b[r[row + 1]];
2508: sum3 = b[r[row + 2]];
2509: sum4 = b[r[row + 3]];
2510: v2 = aa + ai[row + 1];
2511: v3 = aa + ai[row + 2];
2512: v4 = aa + ai[row + 3];
2514: for (j = 0; j < nz - 1; j += 2) {
2515: i0 = vi[j];
2516: i1 = vi[j + 1];
2517: tmp0 = tmps[i0];
2518: tmp1 = tmps[i1];
2519: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2520: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2521: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2522: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2523: }
2524: if (j == nz - 1) {
2525: tmp0 = tmps[vi[j]];
2526: sum1 -= v1[j] * tmp0;
2527: sum2 -= v2[j] * tmp0;
2528: sum3 -= v3[j] * tmp0;
2529: sum4 -= v4[j] * tmp0;
2530: }
2531: sum2 -= v2[nz] * sum1;
2532: sum3 -= v3[nz] * sum1;
2533: sum4 -= v4[nz] * sum1;
2534: sum3 -= v3[nz + 1] * sum2;
2535: sum4 -= v4[nz + 1] * sum2;
2536: sum4 -= v4[nz + 2] * sum3;
2538: tmp[row++] = sum1;
2539: tmp[row++] = sum2;
2540: tmp[row++] = sum3;
2541: tmp[row++] = sum4;
2542: break;
2543: case 5:
2544: sum1 = b[r[row]];
2545: sum2 = b[r[row + 1]];
2546: sum3 = b[r[row + 2]];
2547: sum4 = b[r[row + 3]];
2548: sum5 = b[r[row + 4]];
2549: v2 = aa + ai[row + 1];
2550: v3 = aa + ai[row + 2];
2551: v4 = aa + ai[row + 3];
2552: v5 = aa + ai[row + 4];
2554: for (j = 0; j < nz - 1; j += 2) {
2555: i0 = vi[j];
2556: i1 = vi[j + 1];
2557: tmp0 = tmps[i0];
2558: tmp1 = tmps[i1];
2559: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2560: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2561: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2562: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2563: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2564: }
2565: if (j == nz - 1) {
2566: tmp0 = tmps[vi[j]];
2567: sum1 -= v1[j] * tmp0;
2568: sum2 -= v2[j] * tmp0;
2569: sum3 -= v3[j] * tmp0;
2570: sum4 -= v4[j] * tmp0;
2571: sum5 -= v5[j] * tmp0;
2572: }
2574: sum2 -= v2[nz] * sum1;
2575: sum3 -= v3[nz] * sum1;
2576: sum4 -= v4[nz] * sum1;
2577: sum5 -= v5[nz] * sum1;
2578: sum3 -= v3[nz + 1] * sum2;
2579: sum4 -= v4[nz + 1] * sum2;
2580: sum5 -= v5[nz + 1] * sum2;
2581: sum4 -= v4[nz + 2] * sum3;
2582: sum5 -= v5[nz + 2] * sum3;
2583: sum5 -= v5[nz + 3] * sum4;
2585: tmp[row++] = sum1;
2586: tmp[row++] = sum2;
2587: tmp[row++] = sum3;
2588: tmp[row++] = sum4;
2589: tmp[row++] = sum5;
2590: break;
2591: default:
2592: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2593: }
2594: }
2595: /* backward solve the upper triangular */
2596: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2597: nsz = ns[i];
2598: aii = ad[row + 1] + 1;
2599: v1 = aa + aii;
2600: vi = aj + aii;
2601: nz = ad[row] - ad[row + 1] - 1;
2603: if (i > 0) {
2604: /* Prefetch the indices for the next block */
2605: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2606: /* Prefetch the data for the next block */
2607: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2608: }
2610: switch (nsz) { /* Each loop in 'case' is unrolled */
2611: case 1:
2612: sum1 = tmp[row];
2614: for (j = 0; j < nz - 1; j += 2) {
2615: i0 = vi[j];
2616: i1 = vi[j + 1];
2617: tmp0 = tmps[i0];
2618: tmp1 = tmps[i1];
2619: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2620: }
2621: if (j == nz - 1) {
2622: tmp0 = tmps[vi[j]];
2623: sum1 -= v1[j] * tmp0;
2624: }
2625: x[c[row]] = tmp[row] = sum1 * v1[nz];
2626: row--;
2627: break;
2628: case 2:
2629: sum1 = tmp[row];
2630: sum2 = tmp[row - 1];
2631: v2 = aa + ad[row] + 1;
2632: for (j = 0; j < nz - 1; j += 2) {
2633: i0 = vi[j];
2634: i1 = vi[j + 1];
2635: tmp0 = tmps[i0];
2636: tmp1 = tmps[i1];
2637: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2638: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2639: }
2640: if (j == nz - 1) {
2641: tmp0 = tmps[vi[j]];
2642: sum1 -= v1[j] * tmp0;
2643: sum2 -= v2[j + 1] * tmp0;
2644: }
2646: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2647: row--;
2648: sum2 -= v2[0] * tmp0;
2649: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2650: row--;
2651: break;
2652: case 3:
2653: sum1 = tmp[row];
2654: sum2 = tmp[row - 1];
2655: sum3 = tmp[row - 2];
2656: v2 = aa + ad[row] + 1;
2657: v3 = aa + ad[row - 1] + 1;
2658: for (j = 0; j < nz - 1; j += 2) {
2659: i0 = vi[j];
2660: i1 = vi[j + 1];
2661: tmp0 = tmps[i0];
2662: tmp1 = tmps[i1];
2663: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2664: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2665: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2666: }
2667: if (j == nz - 1) {
2668: tmp0 = tmps[vi[j]];
2669: sum1 -= v1[j] * tmp0;
2670: sum2 -= v2[j + 1] * tmp0;
2671: sum3 -= v3[j + 2] * tmp0;
2672: }
2673: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2674: row--;
2675: sum2 -= v2[0] * tmp0;
2676: sum3 -= v3[1] * tmp0;
2677: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2678: row--;
2679: sum3 -= v3[0] * tmp0;
2680: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2681: row--;
2683: break;
2684: case 4:
2685: sum1 = tmp[row];
2686: sum2 = tmp[row - 1];
2687: sum3 = tmp[row - 2];
2688: sum4 = tmp[row - 3];
2689: v2 = aa + ad[row] + 1;
2690: v3 = aa + ad[row - 1] + 1;
2691: v4 = aa + ad[row - 2] + 1;
2693: for (j = 0; j < nz - 1; j += 2) {
2694: i0 = vi[j];
2695: i1 = vi[j + 1];
2696: tmp0 = tmps[i0];
2697: tmp1 = tmps[i1];
2698: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2699: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2700: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2701: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2702: }
2703: if (j == nz - 1) {
2704: tmp0 = tmps[vi[j]];
2705: sum1 -= v1[j] * tmp0;
2706: sum2 -= v2[j + 1] * tmp0;
2707: sum3 -= v3[j + 2] * tmp0;
2708: sum4 -= v4[j + 3] * tmp0;
2709: }
2711: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2712: row--;
2713: sum2 -= v2[0] * tmp0;
2714: sum3 -= v3[1] * tmp0;
2715: sum4 -= v4[2] * tmp0;
2716: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2717: row--;
2718: sum3 -= v3[0] * tmp0;
2719: sum4 -= v4[1] * tmp0;
2720: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2721: row--;
2722: sum4 -= v4[0] * tmp0;
2723: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2724: row--;
2725: break;
2726: case 5:
2727: sum1 = tmp[row];
2728: sum2 = tmp[row - 1];
2729: sum3 = tmp[row - 2];
2730: sum4 = tmp[row - 3];
2731: sum5 = tmp[row - 4];
2732: v2 = aa + ad[row] + 1;
2733: v3 = aa + ad[row - 1] + 1;
2734: v4 = aa + ad[row - 2] + 1;
2735: v5 = aa + ad[row - 3] + 1;
2736: for (j = 0; j < nz - 1; j += 2) {
2737: i0 = vi[j];
2738: i1 = vi[j + 1];
2739: tmp0 = tmps[i0];
2740: tmp1 = tmps[i1];
2741: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2742: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2743: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2744: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2745: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2746: }
2747: if (j == nz - 1) {
2748: tmp0 = tmps[vi[j]];
2749: sum1 -= v1[j] * tmp0;
2750: sum2 -= v2[j + 1] * tmp0;
2751: sum3 -= v3[j + 2] * tmp0;
2752: sum4 -= v4[j + 3] * tmp0;
2753: sum5 -= v5[j + 4] * tmp0;
2754: }
2756: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2757: row--;
2758: sum2 -= v2[0] * tmp0;
2759: sum3 -= v3[1] * tmp0;
2760: sum4 -= v4[2] * tmp0;
2761: sum5 -= v5[3] * tmp0;
2762: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2763: row--;
2764: sum3 -= v3[0] * tmp0;
2765: sum4 -= v4[1] * tmp0;
2766: sum5 -= v5[2] * tmp0;
2767: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2768: row--;
2769: sum4 -= v4[0] * tmp0;
2770: sum5 -= v5[1] * tmp0;
2771: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2772: row--;
2773: sum5 -= v5[0] * tmp0;
2774: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2775: row--;
2776: break;
2777: default:
2778: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2779: }
2780: }
2781: PetscCall(ISRestoreIndices(isrow, &rout));
2782: PetscCall(ISRestoreIndices(iscol, &cout));
2783: PetscCall(VecRestoreArrayRead(bb, &b));
2784: PetscCall(VecRestoreArrayWrite(xx, &x));
2785: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2786: PetscFunctionReturn(PETSC_SUCCESS);
2787: }
2789: /*
2790: Makes a longer coloring[] array and calls the usual code with that
2791: */
2792: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2793: {
2794: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2795: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2796: PetscInt *colorused, i;
2797: ISColoringValue *newcolor;
2799: PetscFunctionBegin;
2800: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2801: PetscCall(PetscMalloc1(n + 1, &newcolor));
2802: /* loop over inodes, marking a color for each column*/
2803: row = 0;
2804: for (i = 0; i < m; i++) {
2805: for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2806: }
2808: /* eliminate unneeded colors */
2809: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2810: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2812: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2813: ncolors = colorused[5 * ncolors - 1];
2814: for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2815: PetscCall(PetscFree(colorused));
2816: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2817: PetscCall(PetscFree(coloring));
2818: PetscFunctionReturn(PETSC_SUCCESS);
2819: }
2821: #include <petsc/private/kernels/blockinvert.h>
2823: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2824: {
2825: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2826: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2827: MatScalar *ibdiag, *bdiag, work[25], *t;
2828: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2829: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2830: const PetscScalar *xb, *b;
2831: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2832: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2833: PetscInt sz, k, ipvt[5];
2834: PetscBool allowzeropivot, zeropivotdetected;
2835: const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2837: PetscFunctionBegin;
2838: allowzeropivot = PetscNot(A->erroriffailure);
2839: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2840: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2841: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2843: if (!a->inode.ibdiagvalid) {
2844: if (!a->inode.ibdiag) {
2845: /* calculate space needed for diagonal blocks */
2846: for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2847: a->inode.bdiagsize = cnt;
2849: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2850: }
2852: /* copy over the diagonal blocks and invert them */
2853: ibdiag = a->inode.ibdiag;
2854: bdiag = a->inode.bdiag;
2855: cnt = 0;
2856: for (i = 0, row = 0; i < m; i++) {
2857: for (j = 0; j < sizes[i]; j++) {
2858: for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2859: }
2860: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2862: switch (sizes[i]) {
2863: case 1:
2864: /* Create matrix data structure */
2865: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2866: if (allowzeropivot) {
2867: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2868: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2869: A->factorerror_zeropivot_row = row;
2870: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2871: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2872: }
2873: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2874: break;
2875: case 2:
2876: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2877: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2878: break;
2879: case 3:
2880: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2881: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2882: break;
2883: case 4:
2884: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2885: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2886: break;
2887: case 5:
2888: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2889: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2890: break;
2891: default:
2892: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2893: }
2894: cnt += sizes[i] * sizes[i];
2895: row += sizes[i];
2896: }
2897: a->inode.ibdiagvalid = PETSC_TRUE;
2898: }
2899: ibdiag = a->inode.ibdiag;
2900: bdiag = a->inode.bdiag;
2901: t = a->inode.ssor_work;
2903: PetscCall(VecGetArray(xx, &x));
2904: PetscCall(VecGetArrayRead(bb, &b));
2905: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2906: if (flag & SOR_ZERO_INITIAL_GUESS) {
2907: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2908: for (i = 0, row = 0; i < m; i++) {
2909: sz = diag[row] - ii[row];
2910: v1 = a->a + ii[row];
2911: idx = a->j + ii[row];
2913: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2914: switch (sizes[i]) {
2915: case 1:
2917: sum1 = b[row];
2918: for (n = 0; n < sz - 1; n += 2) {
2919: i1 = idx[0];
2920: i2 = idx[1];
2921: idx += 2;
2922: tmp0 = x[i1];
2923: tmp1 = x[i2];
2924: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2925: v1 += 2;
2926: }
2928: if (n == sz - 1) {
2929: tmp0 = x[*idx];
2930: sum1 -= *v1 * tmp0;
2931: }
2932: t[row] = sum1;
2933: x[row++] = sum1 * (*ibdiag++);
2934: break;
2935: case 2:
2936: v2 = a->a + ii[row + 1];
2937: sum1 = b[row];
2938: sum2 = b[row + 1];
2939: for (n = 0; n < sz - 1; n += 2) {
2940: i1 = idx[0];
2941: i2 = idx[1];
2942: idx += 2;
2943: tmp0 = x[i1];
2944: tmp1 = x[i2];
2945: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946: v1 += 2;
2947: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2948: v2 += 2;
2949: }
2951: if (n == sz - 1) {
2952: tmp0 = x[*idx];
2953: sum1 -= v1[0] * tmp0;
2954: sum2 -= v2[0] * tmp0;
2955: }
2956: t[row] = sum1;
2957: t[row + 1] = sum2;
2958: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2959: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2960: ibdiag += 4;
2961: break;
2962: case 3:
2963: v2 = a->a + ii[row + 1];
2964: v3 = a->a + ii[row + 2];
2965: sum1 = b[row];
2966: sum2 = b[row + 1];
2967: sum3 = b[row + 2];
2968: for (n = 0; n < sz - 1; n += 2) {
2969: i1 = idx[0];
2970: i2 = idx[1];
2971: idx += 2;
2972: tmp0 = x[i1];
2973: tmp1 = x[i2];
2974: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2975: v1 += 2;
2976: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2977: v2 += 2;
2978: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2979: v3 += 2;
2980: }
2982: if (n == sz - 1) {
2983: tmp0 = x[*idx];
2984: sum1 -= v1[0] * tmp0;
2985: sum2 -= v2[0] * tmp0;
2986: sum3 -= v3[0] * tmp0;
2987: }
2988: t[row] = sum1;
2989: t[row + 1] = sum2;
2990: t[row + 2] = sum3;
2991: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2992: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2993: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2994: ibdiag += 9;
2995: break;
2996: case 4:
2997: v2 = a->a + ii[row + 1];
2998: v3 = a->a + ii[row + 2];
2999: v4 = a->a + ii[row + 3];
3000: sum1 = b[row];
3001: sum2 = b[row + 1];
3002: sum3 = b[row + 2];
3003: sum4 = b[row + 3];
3004: for (n = 0; n < sz - 1; n += 2) {
3005: i1 = idx[0];
3006: i2 = idx[1];
3007: idx += 2;
3008: tmp0 = x[i1];
3009: tmp1 = x[i2];
3010: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3011: v1 += 2;
3012: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3013: v2 += 2;
3014: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3015: v3 += 2;
3016: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3017: v4 += 2;
3018: }
3020: if (n == sz - 1) {
3021: tmp0 = x[*idx];
3022: sum1 -= v1[0] * tmp0;
3023: sum2 -= v2[0] * tmp0;
3024: sum3 -= v3[0] * tmp0;
3025: sum4 -= v4[0] * tmp0;
3026: }
3027: t[row] = sum1;
3028: t[row + 1] = sum2;
3029: t[row + 2] = sum3;
3030: t[row + 3] = sum4;
3031: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3032: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3033: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3034: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3035: ibdiag += 16;
3036: break;
3037: case 5:
3038: v2 = a->a + ii[row + 1];
3039: v3 = a->a + ii[row + 2];
3040: v4 = a->a + ii[row + 3];
3041: v5 = a->a + ii[row + 4];
3042: sum1 = b[row];
3043: sum2 = b[row + 1];
3044: sum3 = b[row + 2];
3045: sum4 = b[row + 3];
3046: sum5 = b[row + 4];
3047: for (n = 0; n < sz - 1; n += 2) {
3048: i1 = idx[0];
3049: i2 = idx[1];
3050: idx += 2;
3051: tmp0 = x[i1];
3052: tmp1 = x[i2];
3053: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3054: v1 += 2;
3055: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3056: v2 += 2;
3057: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3058: v3 += 2;
3059: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3060: v4 += 2;
3061: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3062: v5 += 2;
3063: }
3065: if (n == sz - 1) {
3066: tmp0 = x[*idx];
3067: sum1 -= v1[0] * tmp0;
3068: sum2 -= v2[0] * tmp0;
3069: sum3 -= v3[0] * tmp0;
3070: sum4 -= v4[0] * tmp0;
3071: sum5 -= v5[0] * tmp0;
3072: }
3073: t[row] = sum1;
3074: t[row + 1] = sum2;
3075: t[row + 2] = sum3;
3076: t[row + 3] = sum4;
3077: t[row + 4] = sum5;
3078: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3079: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3080: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3081: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3082: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3083: ibdiag += 25;
3084: break;
3085: default:
3086: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3087: }
3088: }
3090: xb = t;
3091: PetscCall(PetscLogFlops(a->nz));
3092: } else xb = b;
3093: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3094: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3095: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3096: ibdiag -= sizes[i] * sizes[i];
3097: sz = ii[row + 1] - diag[row] - 1;
3098: v1 = a->a + diag[row] + 1;
3099: idx = a->j + diag[row] + 1;
3101: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3102: switch (sizes[i]) {
3103: case 1:
3105: sum1 = xb[row];
3106: for (n = 0; n < sz - 1; n += 2) {
3107: i1 = idx[0];
3108: i2 = idx[1];
3109: idx += 2;
3110: tmp0 = x[i1];
3111: tmp1 = x[i2];
3112: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3113: v1 += 2;
3114: }
3116: if (n == sz - 1) {
3117: tmp0 = x[*idx];
3118: sum1 -= *v1 * tmp0;
3119: }
3120: x[row--] = sum1 * (*ibdiag);
3121: break;
3123: case 2:
3125: sum1 = xb[row];
3126: sum2 = xb[row - 1];
3127: /* note that sum1 is associated with the second of the two rows */
3128: v2 = a->a + diag[row - 1] + 2;
3129: for (n = 0; n < sz - 1; n += 2) {
3130: i1 = idx[0];
3131: i2 = idx[1];
3132: idx += 2;
3133: tmp0 = x[i1];
3134: tmp1 = x[i2];
3135: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3136: v1 += 2;
3137: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3138: v2 += 2;
3139: }
3141: if (n == sz - 1) {
3142: tmp0 = x[*idx];
3143: sum1 -= *v1 * tmp0;
3144: sum2 -= *v2 * tmp0;
3145: }
3146: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3147: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3148: break;
3149: case 3:
3151: sum1 = xb[row];
3152: sum2 = xb[row - 1];
3153: sum3 = xb[row - 2];
3154: v2 = a->a + diag[row - 1] + 2;
3155: v3 = a->a + diag[row - 2] + 3;
3156: for (n = 0; n < sz - 1; n += 2) {
3157: i1 = idx[0];
3158: i2 = idx[1];
3159: idx += 2;
3160: tmp0 = x[i1];
3161: tmp1 = x[i2];
3162: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3163: v1 += 2;
3164: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3165: v2 += 2;
3166: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3167: v3 += 2;
3168: }
3170: if (n == sz - 1) {
3171: tmp0 = x[*idx];
3172: sum1 -= *v1 * tmp0;
3173: sum2 -= *v2 * tmp0;
3174: sum3 -= *v3 * tmp0;
3175: }
3176: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3177: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3178: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3179: break;
3180: case 4:
3182: sum1 = xb[row];
3183: sum2 = xb[row - 1];
3184: sum3 = xb[row - 2];
3185: sum4 = xb[row - 3];
3186: v2 = a->a + diag[row - 1] + 2;
3187: v3 = a->a + diag[row - 2] + 3;
3188: v4 = a->a + diag[row - 3] + 4;
3189: for (n = 0; n < sz - 1; n += 2) {
3190: i1 = idx[0];
3191: i2 = idx[1];
3192: idx += 2;
3193: tmp0 = x[i1];
3194: tmp1 = x[i2];
3195: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3196: v1 += 2;
3197: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3198: v2 += 2;
3199: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3200: v3 += 2;
3201: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3202: v4 += 2;
3203: }
3205: if (n == sz - 1) {
3206: tmp0 = x[*idx];
3207: sum1 -= *v1 * tmp0;
3208: sum2 -= *v2 * tmp0;
3209: sum3 -= *v3 * tmp0;
3210: sum4 -= *v4 * tmp0;
3211: }
3212: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3213: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3214: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3215: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3216: break;
3217: case 5:
3219: sum1 = xb[row];
3220: sum2 = xb[row - 1];
3221: sum3 = xb[row - 2];
3222: sum4 = xb[row - 3];
3223: sum5 = xb[row - 4];
3224: v2 = a->a + diag[row - 1] + 2;
3225: v3 = a->a + diag[row - 2] + 3;
3226: v4 = a->a + diag[row - 3] + 4;
3227: v5 = a->a + diag[row - 4] + 5;
3228: for (n = 0; n < sz - 1; n += 2) {
3229: i1 = idx[0];
3230: i2 = idx[1];
3231: idx += 2;
3232: tmp0 = x[i1];
3233: tmp1 = x[i2];
3234: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3235: v1 += 2;
3236: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3237: v2 += 2;
3238: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3239: v3 += 2;
3240: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3241: v4 += 2;
3242: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3243: v5 += 2;
3244: }
3246: if (n == sz - 1) {
3247: tmp0 = x[*idx];
3248: sum1 -= *v1 * tmp0;
3249: sum2 -= *v2 * tmp0;
3250: sum3 -= *v3 * tmp0;
3251: sum4 -= *v4 * tmp0;
3252: sum5 -= *v5 * tmp0;
3253: }
3254: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3255: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3256: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3257: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3258: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3259: break;
3260: default:
3261: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3262: }
3263: }
3265: PetscCall(PetscLogFlops(a->nz));
3266: }
3267: its--;
3268: }
3269: while (its--) {
3270: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3271: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3272: sz = diag[row] - ii[row];
3273: v1 = a->a + ii[row];
3274: idx = a->j + ii[row];
3275: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3276: switch (sizes[i]) {
3277: case 1:
3278: sum1 = b[row];
3279: for (n = 0; n < sz - 1; n += 2) {
3280: i1 = idx[0];
3281: i2 = idx[1];
3282: idx += 2;
3283: tmp0 = x[i1];
3284: tmp1 = x[i2];
3285: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3286: v1 += 2;
3287: }
3288: if (n == sz - 1) {
3289: tmp0 = x[*idx++];
3290: sum1 -= *v1 * tmp0;
3291: v1++;
3292: }
3293: t[row] = sum1;
3294: sz = ii[row + 1] - diag[row] - 1;
3295: idx = a->j + diag[row] + 1;
3296: v1 += 1;
3297: for (n = 0; n < sz - 1; n += 2) {
3298: i1 = idx[0];
3299: i2 = idx[1];
3300: idx += 2;
3301: tmp0 = x[i1];
3302: tmp1 = x[i2];
3303: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3304: v1 += 2;
3305: }
3306: if (n == sz - 1) {
3307: tmp0 = x[*idx++];
3308: sum1 -= *v1 * tmp0;
3309: }
3310: /* in MatSOR_SeqAIJ this line would be
3311: *
3312: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3313: *
3314: * but omega == 1, so this becomes
3315: *
3316: * x[row] = sum1*(*ibdiag++);
3317: *
3318: */
3319: x[row] = sum1 * (*ibdiag);
3320: break;
3321: case 2:
3322: v2 = a->a + ii[row + 1];
3323: sum1 = b[row];
3324: sum2 = b[row + 1];
3325: for (n = 0; n < sz - 1; n += 2) {
3326: i1 = idx[0];
3327: i2 = idx[1];
3328: idx += 2;
3329: tmp0 = x[i1];
3330: tmp1 = x[i2];
3331: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3332: v1 += 2;
3333: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3334: v2 += 2;
3335: }
3336: if (n == sz - 1) {
3337: tmp0 = x[*idx++];
3338: sum1 -= v1[0] * tmp0;
3339: sum2 -= v2[0] * tmp0;
3340: v1++;
3341: v2++;
3342: }
3343: t[row] = sum1;
3344: t[row + 1] = sum2;
3345: sz = ii[row + 1] - diag[row] - 2;
3346: idx = a->j + diag[row] + 2;
3347: v1 += 2;
3348: v2 += 2;
3349: for (n = 0; n < sz - 1; n += 2) {
3350: i1 = idx[0];
3351: i2 = idx[1];
3352: idx += 2;
3353: tmp0 = x[i1];
3354: tmp1 = x[i2];
3355: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3356: v1 += 2;
3357: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3358: v2 += 2;
3359: }
3360: if (n == sz - 1) {
3361: tmp0 = x[*idx];
3362: sum1 -= v1[0] * tmp0;
3363: sum2 -= v2[0] * tmp0;
3364: }
3365: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3366: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3367: break;
3368: case 3:
3369: v2 = a->a + ii[row + 1];
3370: v3 = a->a + ii[row + 2];
3371: sum1 = b[row];
3372: sum2 = b[row + 1];
3373: sum3 = b[row + 2];
3374: for (n = 0; n < sz - 1; n += 2) {
3375: i1 = idx[0];
3376: i2 = idx[1];
3377: idx += 2;
3378: tmp0 = x[i1];
3379: tmp1 = x[i2];
3380: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3381: v1 += 2;
3382: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3383: v2 += 2;
3384: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3385: v3 += 2;
3386: }
3387: if (n == sz - 1) {
3388: tmp0 = x[*idx++];
3389: sum1 -= v1[0] * tmp0;
3390: sum2 -= v2[0] * tmp0;
3391: sum3 -= v3[0] * tmp0;
3392: v1++;
3393: v2++;
3394: v3++;
3395: }
3396: t[row] = sum1;
3397: t[row + 1] = sum2;
3398: t[row + 2] = sum3;
3399: sz = ii[row + 1] - diag[row] - 3;
3400: idx = a->j + diag[row] + 3;
3401: v1 += 3;
3402: v2 += 3;
3403: v3 += 3;
3404: for (n = 0; n < sz - 1; n += 2) {
3405: i1 = idx[0];
3406: i2 = idx[1];
3407: idx += 2;
3408: tmp0 = x[i1];
3409: tmp1 = x[i2];
3410: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3411: v1 += 2;
3412: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3413: v2 += 2;
3414: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3415: v3 += 2;
3416: }
3417: if (n == sz - 1) {
3418: tmp0 = x[*idx];
3419: sum1 -= v1[0] * tmp0;
3420: sum2 -= v2[0] * tmp0;
3421: sum3 -= v3[0] * tmp0;
3422: }
3423: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3424: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3425: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3426: break;
3427: case 4:
3428: v2 = a->a + ii[row + 1];
3429: v3 = a->a + ii[row + 2];
3430: v4 = a->a + ii[row + 3];
3431: sum1 = b[row];
3432: sum2 = b[row + 1];
3433: sum3 = b[row + 2];
3434: sum4 = b[row + 3];
3435: for (n = 0; n < sz - 1; n += 2) {
3436: i1 = idx[0];
3437: i2 = idx[1];
3438: idx += 2;
3439: tmp0 = x[i1];
3440: tmp1 = x[i2];
3441: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3442: v1 += 2;
3443: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3444: v2 += 2;
3445: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3446: v3 += 2;
3447: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3448: v4 += 2;
3449: }
3450: if (n == sz - 1) {
3451: tmp0 = x[*idx++];
3452: sum1 -= v1[0] * tmp0;
3453: sum2 -= v2[0] * tmp0;
3454: sum3 -= v3[0] * tmp0;
3455: sum4 -= v4[0] * tmp0;
3456: v1++;
3457: v2++;
3458: v3++;
3459: v4++;
3460: }
3461: t[row] = sum1;
3462: t[row + 1] = sum2;
3463: t[row + 2] = sum3;
3464: t[row + 3] = sum4;
3465: sz = ii[row + 1] - diag[row] - 4;
3466: idx = a->j + diag[row] + 4;
3467: v1 += 4;
3468: v2 += 4;
3469: v3 += 4;
3470: v4 += 4;
3471: for (n = 0; n < sz - 1; n += 2) {
3472: i1 = idx[0];
3473: i2 = idx[1];
3474: idx += 2;
3475: tmp0 = x[i1];
3476: tmp1 = x[i2];
3477: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3478: v1 += 2;
3479: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3480: v2 += 2;
3481: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3482: v3 += 2;
3483: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3484: v4 += 2;
3485: }
3486: if (n == sz - 1) {
3487: tmp0 = x[*idx];
3488: sum1 -= v1[0] * tmp0;
3489: sum2 -= v2[0] * tmp0;
3490: sum3 -= v3[0] * tmp0;
3491: sum4 -= v4[0] * tmp0;
3492: }
3493: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3494: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3495: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3496: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3497: break;
3498: case 5:
3499: v2 = a->a + ii[row + 1];
3500: v3 = a->a + ii[row + 2];
3501: v4 = a->a + ii[row + 3];
3502: v5 = a->a + ii[row + 4];
3503: sum1 = b[row];
3504: sum2 = b[row + 1];
3505: sum3 = b[row + 2];
3506: sum4 = b[row + 3];
3507: sum5 = b[row + 4];
3508: for (n = 0; n < sz - 1; n += 2) {
3509: i1 = idx[0];
3510: i2 = idx[1];
3511: idx += 2;
3512: tmp0 = x[i1];
3513: tmp1 = x[i2];
3514: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3515: v1 += 2;
3516: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3517: v2 += 2;
3518: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3519: v3 += 2;
3520: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3521: v4 += 2;
3522: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3523: v5 += 2;
3524: }
3525: if (n == sz - 1) {
3526: tmp0 = x[*idx++];
3527: sum1 -= v1[0] * tmp0;
3528: sum2 -= v2[0] * tmp0;
3529: sum3 -= v3[0] * tmp0;
3530: sum4 -= v4[0] * tmp0;
3531: sum5 -= v5[0] * tmp0;
3532: v1++;
3533: v2++;
3534: v3++;
3535: v4++;
3536: v5++;
3537: }
3538: t[row] = sum1;
3539: t[row + 1] = sum2;
3540: t[row + 2] = sum3;
3541: t[row + 3] = sum4;
3542: t[row + 4] = sum5;
3543: sz = ii[row + 1] - diag[row] - 5;
3544: idx = a->j + diag[row] + 5;
3545: v1 += 5;
3546: v2 += 5;
3547: v3 += 5;
3548: v4 += 5;
3549: v5 += 5;
3550: for (n = 0; n < sz - 1; n += 2) {
3551: i1 = idx[0];
3552: i2 = idx[1];
3553: idx += 2;
3554: tmp0 = x[i1];
3555: tmp1 = x[i2];
3556: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3557: v1 += 2;
3558: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3559: v2 += 2;
3560: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3561: v3 += 2;
3562: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3563: v4 += 2;
3564: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3565: v5 += 2;
3566: }
3567: if (n == sz - 1) {
3568: tmp0 = x[*idx];
3569: sum1 -= v1[0] * tmp0;
3570: sum2 -= v2[0] * tmp0;
3571: sum3 -= v3[0] * tmp0;
3572: sum4 -= v4[0] * tmp0;
3573: sum5 -= v5[0] * tmp0;
3574: }
3575: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3576: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3577: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3578: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3579: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3580: break;
3581: default:
3582: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3583: }
3584: }
3585: xb = t;
3586: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3587: } else xb = b;
3589: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3590: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3591: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3592: ibdiag -= sizes[i] * sizes[i];
3594: /* set RHS */
3595: if (xb == b) {
3596: /* whole (old way) */
3597: sz = ii[row + 1] - ii[row];
3598: idx = a->j + ii[row];
3599: switch (sizes[i]) {
3600: case 5:
3601: v5 = a->a + ii[row - 4]; /* fall through */
3602: case 4:
3603: v4 = a->a + ii[row - 3]; /* fall through */
3604: case 3:
3605: v3 = a->a + ii[row - 2]; /* fall through */
3606: case 2:
3607: v2 = a->a + ii[row - 1]; /* fall through */
3608: case 1:
3609: v1 = a->a + ii[row];
3610: break;
3611: default:
3612: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3613: }
3614: } else {
3615: /* upper, no diag */
3616: sz = ii[row + 1] - diag[row] - 1;
3617: idx = a->j + diag[row] + 1;
3618: switch (sizes[i]) {
3619: case 5:
3620: v5 = a->a + diag[row - 4] + 5; /* fall through */
3621: case 4:
3622: v4 = a->a + diag[row - 3] + 4; /* fall through */
3623: case 3:
3624: v3 = a->a + diag[row - 2] + 3; /* fall through */
3625: case 2:
3626: v2 = a->a + diag[row - 1] + 2; /* fall through */
3627: case 1:
3628: v1 = a->a + diag[row] + 1;
3629: }
3630: }
3631: /* set sum */
3632: switch (sizes[i]) {
3633: case 5:
3634: sum5 = xb[row - 4]; /* fall through */
3635: case 4:
3636: sum4 = xb[row - 3]; /* fall through */
3637: case 3:
3638: sum3 = xb[row - 2]; /* fall through */
3639: case 2:
3640: sum2 = xb[row - 1]; /* fall through */
3641: case 1:
3642: /* note that sum1 is associated with the last row */
3643: sum1 = xb[row];
3644: }
3645: /* do sums */
3646: for (n = 0; n < sz - 1; n += 2) {
3647: i1 = idx[0];
3648: i2 = idx[1];
3649: idx += 2;
3650: tmp0 = x[i1];
3651: tmp1 = x[i2];
3652: switch (sizes[i]) {
3653: case 5:
3654: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3655: v5 += 2; /* fall through */
3656: case 4:
3657: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3658: v4 += 2; /* fall through */
3659: case 3:
3660: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3661: v3 += 2; /* fall through */
3662: case 2:
3663: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3664: v2 += 2; /* fall through */
3665: case 1:
3666: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3667: v1 += 2;
3668: }
3669: }
3670: /* ragged edge */
3671: if (n == sz - 1) {
3672: tmp0 = x[*idx];
3673: switch (sizes[i]) {
3674: case 5:
3675: sum5 -= *v5 * tmp0; /* fall through */
3676: case 4:
3677: sum4 -= *v4 * tmp0; /* fall through */
3678: case 3:
3679: sum3 -= *v3 * tmp0; /* fall through */
3680: case 2:
3681: sum2 -= *v2 * tmp0; /* fall through */
3682: case 1:
3683: sum1 -= *v1 * tmp0;
3684: }
3685: }
3686: /* update */
3687: if (xb == b) {
3688: /* whole (old way) w/ diag */
3689: switch (sizes[i]) {
3690: case 5:
3691: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3692: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3693: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3694: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3695: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3696: break;
3697: case 4:
3698: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3699: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3700: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3701: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3702: break;
3703: case 3:
3704: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3705: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3706: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3707: break;
3708: case 2:
3709: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3710: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3711: break;
3712: case 1:
3713: x[row--] += sum1 * (*ibdiag);
3714: break;
3715: }
3716: } else {
3717: /* no diag so set = */
3718: switch (sizes[i]) {
3719: case 5:
3720: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3721: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3722: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3723: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3724: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3725: break;
3726: case 4:
3727: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3728: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3729: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3730: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3731: break;
3732: case 3:
3733: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3734: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3735: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3736: break;
3737: case 2:
3738: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3739: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3740: break;
3741: case 1:
3742: x[row--] = sum1 * (*ibdiag);
3743: break;
3744: }
3745: }
3746: }
3747: if (xb == b) {
3748: PetscCall(PetscLogFlops(2.0 * a->nz));
3749: } else {
3750: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3751: }
3752: }
3753: }
3754: if (flag & SOR_EISENSTAT) {
3755: /*
3756: Apply (U + D)^-1 where D is now the block diagonal
3757: */
3758: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3759: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3760: ibdiag -= sizes[i] * sizes[i];
3761: sz = ii[row + 1] - diag[row] - 1;
3762: v1 = a->a + diag[row] + 1;
3763: idx = a->j + diag[row] + 1;
3764: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3765: switch (sizes[i]) {
3766: case 1:
3768: sum1 = b[row];
3769: for (n = 0; n < sz - 1; n += 2) {
3770: i1 = idx[0];
3771: i2 = idx[1];
3772: idx += 2;
3773: tmp0 = x[i1];
3774: tmp1 = x[i2];
3775: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3776: v1 += 2;
3777: }
3779: if (n == sz - 1) {
3780: tmp0 = x[*idx];
3781: sum1 -= *v1 * tmp0;
3782: }
3783: x[row] = sum1 * (*ibdiag);
3784: row--;
3785: break;
3787: case 2:
3789: sum1 = b[row];
3790: sum2 = b[row - 1];
3791: /* note that sum1 is associated with the second of the two rows */
3792: v2 = a->a + diag[row - 1] + 2;
3793: for (n = 0; n < sz - 1; n += 2) {
3794: i1 = idx[0];
3795: i2 = idx[1];
3796: idx += 2;
3797: tmp0 = x[i1];
3798: tmp1 = x[i2];
3799: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3800: v1 += 2;
3801: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3802: v2 += 2;
3803: }
3805: if (n == sz - 1) {
3806: tmp0 = x[*idx];
3807: sum1 -= *v1 * tmp0;
3808: sum2 -= *v2 * tmp0;
3809: }
3810: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3811: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3812: row -= 2;
3813: break;
3814: case 3:
3816: sum1 = b[row];
3817: sum2 = b[row - 1];
3818: sum3 = b[row - 2];
3819: v2 = a->a + diag[row - 1] + 2;
3820: v3 = a->a + diag[row - 2] + 3;
3821: for (n = 0; n < sz - 1; n += 2) {
3822: i1 = idx[0];
3823: i2 = idx[1];
3824: idx += 2;
3825: tmp0 = x[i1];
3826: tmp1 = x[i2];
3827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3828: v1 += 2;
3829: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3830: v2 += 2;
3831: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3832: v3 += 2;
3833: }
3835: if (n == sz - 1) {
3836: tmp0 = x[*idx];
3837: sum1 -= *v1 * tmp0;
3838: sum2 -= *v2 * tmp0;
3839: sum3 -= *v3 * tmp0;
3840: }
3841: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3842: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3843: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3844: row -= 3;
3845: break;
3846: case 4:
3848: sum1 = b[row];
3849: sum2 = b[row - 1];
3850: sum3 = b[row - 2];
3851: sum4 = b[row - 3];
3852: v2 = a->a + diag[row - 1] + 2;
3853: v3 = a->a + diag[row - 2] + 3;
3854: v4 = a->a + diag[row - 3] + 4;
3855: for (n = 0; n < sz - 1; n += 2) {
3856: i1 = idx[0];
3857: i2 = idx[1];
3858: idx += 2;
3859: tmp0 = x[i1];
3860: tmp1 = x[i2];
3861: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3862: v1 += 2;
3863: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3864: v2 += 2;
3865: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3866: v3 += 2;
3867: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3868: v4 += 2;
3869: }
3871: if (n == sz - 1) {
3872: tmp0 = x[*idx];
3873: sum1 -= *v1 * tmp0;
3874: sum2 -= *v2 * tmp0;
3875: sum3 -= *v3 * tmp0;
3876: sum4 -= *v4 * tmp0;
3877: }
3878: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3879: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3880: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3881: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3882: row -= 4;
3883: break;
3884: case 5:
3886: sum1 = b[row];
3887: sum2 = b[row - 1];
3888: sum3 = b[row - 2];
3889: sum4 = b[row - 3];
3890: sum5 = b[row - 4];
3891: v2 = a->a + diag[row - 1] + 2;
3892: v3 = a->a + diag[row - 2] + 3;
3893: v4 = a->a + diag[row - 3] + 4;
3894: v5 = a->a + diag[row - 4] + 5;
3895: for (n = 0; n < sz - 1; n += 2) {
3896: i1 = idx[0];
3897: i2 = idx[1];
3898: idx += 2;
3899: tmp0 = x[i1];
3900: tmp1 = x[i2];
3901: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3902: v1 += 2;
3903: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3904: v2 += 2;
3905: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3906: v3 += 2;
3907: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3908: v4 += 2;
3909: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3910: v5 += 2;
3911: }
3913: if (n == sz - 1) {
3914: tmp0 = x[*idx];
3915: sum1 -= *v1 * tmp0;
3916: sum2 -= *v2 * tmp0;
3917: sum3 -= *v3 * tmp0;
3918: sum4 -= *v4 * tmp0;
3919: sum5 -= *v5 * tmp0;
3920: }
3921: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3922: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3923: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3924: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3925: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3926: row -= 5;
3927: break;
3928: default:
3929: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3930: }
3931: }
3932: PetscCall(PetscLogFlops(a->nz));
3934: /*
3935: t = b - D x where D is the block diagonal
3936: */
3937: cnt = 0;
3938: for (i = 0, row = 0; i < m; i++) {
3939: switch (sizes[i]) {
3940: case 1:
3941: t[row] = b[row] - bdiag[cnt++] * x[row];
3942: row++;
3943: break;
3944: case 2:
3945: x1 = x[row];
3946: x2 = x[row + 1];
3947: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3948: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3949: t[row] = b[row] - tmp1;
3950: t[row + 1] = b[row + 1] - tmp2;
3951: row += 2;
3952: cnt += 4;
3953: break;
3954: case 3:
3955: x1 = x[row];
3956: x2 = x[row + 1];
3957: x3 = x[row + 2];
3958: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3959: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3960: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3961: t[row] = b[row] - tmp1;
3962: t[row + 1] = b[row + 1] - tmp2;
3963: t[row + 2] = b[row + 2] - tmp3;
3964: row += 3;
3965: cnt += 9;
3966: break;
3967: case 4:
3968: x1 = x[row];
3969: x2 = x[row + 1];
3970: x3 = x[row + 2];
3971: x4 = x[row + 3];
3972: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3973: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3974: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3975: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3976: t[row] = b[row] - tmp1;
3977: t[row + 1] = b[row + 1] - tmp2;
3978: t[row + 2] = b[row + 2] - tmp3;
3979: t[row + 3] = b[row + 3] - tmp4;
3980: row += 4;
3981: cnt += 16;
3982: break;
3983: case 5:
3984: x1 = x[row];
3985: x2 = x[row + 1];
3986: x3 = x[row + 2];
3987: x4 = x[row + 3];
3988: x5 = x[row + 4];
3989: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3990: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3991: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3992: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3993: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3994: t[row] = b[row] - tmp1;
3995: t[row + 1] = b[row + 1] - tmp2;
3996: t[row + 2] = b[row + 2] - tmp3;
3997: t[row + 3] = b[row + 3] - tmp4;
3998: t[row + 4] = b[row + 4] - tmp5;
3999: row += 5;
4000: cnt += 25;
4001: break;
4002: default:
4003: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4004: }
4005: }
4006: PetscCall(PetscLogFlops(m));
4008: /*
4009: Apply (L + D)^-1 where D is the block diagonal
4010: */
4011: for (i = 0, row = 0; i < m; i++) {
4012: sz = diag[row] - ii[row];
4013: v1 = a->a + ii[row];
4014: idx = a->j + ii[row];
4015: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4016: switch (sizes[i]) {
4017: case 1:
4019: sum1 = t[row];
4020: for (n = 0; n < sz - 1; n += 2) {
4021: i1 = idx[0];
4022: i2 = idx[1];
4023: idx += 2;
4024: tmp0 = t[i1];
4025: tmp1 = t[i2];
4026: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4027: v1 += 2;
4028: }
4030: if (n == sz - 1) {
4031: tmp0 = t[*idx];
4032: sum1 -= *v1 * tmp0;
4033: }
4034: x[row] += t[row] = sum1 * (*ibdiag++);
4035: row++;
4036: break;
4037: case 2:
4038: v2 = a->a + ii[row + 1];
4039: sum1 = t[row];
4040: sum2 = t[row + 1];
4041: for (n = 0; n < sz - 1; n += 2) {
4042: i1 = idx[0];
4043: i2 = idx[1];
4044: idx += 2;
4045: tmp0 = t[i1];
4046: tmp1 = t[i2];
4047: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4048: v1 += 2;
4049: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4050: v2 += 2;
4051: }
4053: if (n == sz - 1) {
4054: tmp0 = t[*idx];
4055: sum1 -= v1[0] * tmp0;
4056: sum2 -= v2[0] * tmp0;
4057: }
4058: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4059: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4060: ibdiag += 4;
4061: row += 2;
4062: break;
4063: case 3:
4064: v2 = a->a + ii[row + 1];
4065: v3 = a->a + ii[row + 2];
4066: sum1 = t[row];
4067: sum2 = t[row + 1];
4068: sum3 = t[row + 2];
4069: for (n = 0; n < sz - 1; n += 2) {
4070: i1 = idx[0];
4071: i2 = idx[1];
4072: idx += 2;
4073: tmp0 = t[i1];
4074: tmp1 = t[i2];
4075: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4076: v1 += 2;
4077: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4078: v2 += 2;
4079: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4080: v3 += 2;
4081: }
4083: if (n == sz - 1) {
4084: tmp0 = t[*idx];
4085: sum1 -= v1[0] * tmp0;
4086: sum2 -= v2[0] * tmp0;
4087: sum3 -= v3[0] * tmp0;
4088: }
4089: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4090: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4091: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4092: ibdiag += 9;
4093: row += 3;
4094: break;
4095: case 4:
4096: v2 = a->a + ii[row + 1];
4097: v3 = a->a + ii[row + 2];
4098: v4 = a->a + ii[row + 3];
4099: sum1 = t[row];
4100: sum2 = t[row + 1];
4101: sum3 = t[row + 2];
4102: sum4 = t[row + 3];
4103: for (n = 0; n < sz - 1; n += 2) {
4104: i1 = idx[0];
4105: i2 = idx[1];
4106: idx += 2;
4107: tmp0 = t[i1];
4108: tmp1 = t[i2];
4109: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4110: v1 += 2;
4111: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4112: v2 += 2;
4113: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4114: v3 += 2;
4115: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4116: v4 += 2;
4117: }
4119: if (n == sz - 1) {
4120: tmp0 = t[*idx];
4121: sum1 -= v1[0] * tmp0;
4122: sum2 -= v2[0] * tmp0;
4123: sum3 -= v3[0] * tmp0;
4124: sum4 -= v4[0] * tmp0;
4125: }
4126: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4127: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4128: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4129: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4130: ibdiag += 16;
4131: row += 4;
4132: break;
4133: case 5:
4134: v2 = a->a + ii[row + 1];
4135: v3 = a->a + ii[row + 2];
4136: v4 = a->a + ii[row + 3];
4137: v5 = a->a + ii[row + 4];
4138: sum1 = t[row];
4139: sum2 = t[row + 1];
4140: sum3 = t[row + 2];
4141: sum4 = t[row + 3];
4142: sum5 = t[row + 4];
4143: for (n = 0; n < sz - 1; n += 2) {
4144: i1 = idx[0];
4145: i2 = idx[1];
4146: idx += 2;
4147: tmp0 = t[i1];
4148: tmp1 = t[i2];
4149: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4150: v1 += 2;
4151: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4152: v2 += 2;
4153: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4154: v3 += 2;
4155: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4156: v4 += 2;
4157: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4158: v5 += 2;
4159: }
4161: if (n == sz - 1) {
4162: tmp0 = t[*idx];
4163: sum1 -= v1[0] * tmp0;
4164: sum2 -= v2[0] * tmp0;
4165: sum3 -= v3[0] * tmp0;
4166: sum4 -= v4[0] * tmp0;
4167: sum5 -= v5[0] * tmp0;
4168: }
4169: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4170: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4171: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4172: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4173: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4174: ibdiag += 25;
4175: row += 5;
4176: break;
4177: default:
4178: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4179: }
4180: }
4181: PetscCall(PetscLogFlops(a->nz));
4182: }
4183: PetscCall(VecRestoreArray(xx, &x));
4184: PetscCall(VecRestoreArrayRead(bb, &b));
4185: PetscFunctionReturn(PETSC_SUCCESS);
4186: }
4188: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4189: {
4190: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4191: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4192: const MatScalar *bdiag = a->inode.bdiag;
4193: const PetscScalar *b;
4194: PetscInt m = a->inode.node_count, cnt = 0, i, row;
4195: const PetscInt *sizes = a->inode.size;
4197: PetscFunctionBegin;
4198: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4199: PetscCall(VecGetArray(xx, &x));
4200: PetscCall(VecGetArrayRead(bb, &b));
4201: cnt = 0;
4202: for (i = 0, row = 0; i < m; i++) {
4203: switch (sizes[i]) {
4204: case 1:
4205: x[row] = b[row] * bdiag[cnt++];
4206: row++;
4207: break;
4208: case 2:
4209: x1 = b[row];
4210: x2 = b[row + 1];
4211: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4212: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4213: x[row++] = tmp1;
4214: x[row++] = tmp2;
4215: cnt += 4;
4216: break;
4217: case 3:
4218: x1 = b[row];
4219: x2 = b[row + 1];
4220: x3 = b[row + 2];
4221: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4222: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4223: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4224: x[row++] = tmp1;
4225: x[row++] = tmp2;
4226: x[row++] = tmp3;
4227: cnt += 9;
4228: break;
4229: case 4:
4230: x1 = b[row];
4231: x2 = b[row + 1];
4232: x3 = b[row + 2];
4233: x4 = b[row + 3];
4234: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4235: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4236: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4237: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4238: x[row++] = tmp1;
4239: x[row++] = tmp2;
4240: x[row++] = tmp3;
4241: x[row++] = tmp4;
4242: cnt += 16;
4243: break;
4244: case 5:
4245: x1 = b[row];
4246: x2 = b[row + 1];
4247: x3 = b[row + 2];
4248: x4 = b[row + 3];
4249: x5 = b[row + 4];
4250: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4251: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4252: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4253: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4254: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4255: x[row++] = tmp1;
4256: x[row++] = tmp2;
4257: x[row++] = tmp3;
4258: x[row++] = tmp4;
4259: x[row++] = tmp5;
4260: cnt += 25;
4261: break;
4262: default:
4263: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4264: }
4265: }
4266: PetscCall(PetscLogFlops(2.0 * cnt));
4267: PetscCall(VecRestoreArray(xx, &x));
4268: PetscCall(VecRestoreArrayRead(bb, &b));
4269: PetscFunctionReturn(PETSC_SUCCESS);
4270: }
4272: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4273: {
4274: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4276: PetscFunctionBegin;
4277: a->inode.node_count = 0;
4278: a->inode.use = PETSC_FALSE;
4279: a->inode.checked = PETSC_FALSE;
4280: a->inode.mat_nonzerostate = -1;
4281: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4282: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4283: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4284: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4285: A->ops->coloringpatch = NULL;
4286: A->ops->multdiagonalblock = NULL;
4287: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4288: PetscFunctionReturn(PETSC_SUCCESS);
4289: }
4291: /*
4292: samestructure indicates that the matrix has not changed its nonzero structure so we
4293: do not need to recompute the inodes
4294: */
4295: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4296: {
4297: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4298: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4299: PetscBool flag;
4300: const PetscInt *idx, *idy, *ii;
4302: PetscFunctionBegin;
4303: if (!a->inode.use) {
4304: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4305: PetscCall(PetscFree(a->inode.size));
4306: PetscFunctionReturn(PETSC_SUCCESS);
4307: }
4308: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4310: m = A->rmap->n;
4311: if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4312: ns = a->inode.size;
4314: i = 0;
4315: node_count = 0;
4316: idx = a->j;
4317: ii = a->i;
4318: if (idx) {
4319: while (i < m) { /* For each row */
4320: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4321: /* Limits the number of elements in a node to 'a->inode.limit' */
4322: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4323: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4324: if (nzy != nzx) break;
4325: idy += nzx; /* Same nonzero pattern */
4326: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4327: if (!flag) break;
4328: }
4329: ns[node_count++] = blk_size;
4330: idx += blk_size * nzx;
4331: i = j;
4332: }
4333: }
4334: /* If not enough inodes found,, do not use inode version of the routines */
4335: if (!m || !idx || node_count > .8 * m) {
4336: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4337: PetscCall(PetscFree(a->inode.size));
4338: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4339: } else {
4340: if (!A->factortype) {
4341: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4342: if (A->rmap->n == A->cmap->n) {
4343: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4344: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4345: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4346: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4347: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4348: }
4349: } else {
4350: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4351: }
4352: a->inode.node_count = node_count;
4353: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4354: }
4355: a->inode.checked = PETSC_TRUE;
4356: a->inode.mat_nonzerostate = A->nonzerostate;
4357: PetscFunctionReturn(PETSC_SUCCESS);
4358: }
4360: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4361: {
4362: Mat B = *C;
4363: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4364: PetscInt m = A->rmap->n;
4366: PetscFunctionBegin;
4367: c->inode.use = a->inode.use;
4368: c->inode.limit = a->inode.limit;
4369: c->inode.max_limit = a->inode.max_limit;
4370: c->inode.checked = PETSC_FALSE;
4371: c->inode.size = NULL;
4372: c->inode.node_count = 0;
4373: c->inode.ibdiagvalid = PETSC_FALSE;
4374: c->inode.ibdiag = NULL;
4375: c->inode.bdiag = NULL;
4376: c->inode.mat_nonzerostate = -1;
4377: if (a->inode.use) {
4378: if (a->inode.checked && a->inode.size) {
4379: PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4380: PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4382: c->inode.checked = PETSC_TRUE;
4383: c->inode.node_count = a->inode.node_count;
4384: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4385: }
4386: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4387: if (!B->factortype) {
4388: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4389: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4390: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4391: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4392: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4393: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4394: } else {
4395: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4396: }
4397: }
4398: PetscFunctionReturn(PETSC_SUCCESS);
4399: }
4401: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4402: {
4403: PetscInt k;
4404: const PetscInt *vi;
4406: PetscFunctionBegin;
4407: vi = aj + ai[row];
4408: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4409: vi = aj + adiag[row];
4410: cols[nzl] = vi[0];
4411: vi = aj + adiag[row + 1] + 1;
4412: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4413: PetscFunctionReturn(PETSC_SUCCESS);
4414: }
4415: /*
4416: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4417: Modified from MatSeqAIJCheckInode().
4419: Input Parameters:
4420: . Mat A - ILU or LU matrix factor
4422: */
4423: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4424: {
4425: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4426: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4427: PetscInt *cols1, *cols2, *ns;
4428: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4429: PetscBool flag;
4431: PetscFunctionBegin;
4432: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4433: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4435: m = A->rmap->n;
4436: if (a->inode.size) ns = a->inode.size;
4437: else PetscCall(PetscMalloc1(m + 1, &ns));
4439: i = 0;
4440: node_count = 0;
4441: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4442: while (i < m) { /* For each row */
4443: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4444: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4445: nzx = nzl1 + nzu1 + 1;
4446: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4448: /* Limits the number of elements in a node to 'a->inode.limit' */
4449: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4450: nzl2 = ai[j + 1] - ai[j];
4451: nzu2 = adiag[j] - adiag[j + 1] - 1;
4452: nzy = nzl2 + nzu2 + 1;
4453: if (nzy != nzx) break;
4454: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4455: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4456: if (!flag) break;
4457: }
4458: ns[node_count++] = blk_size;
4459: i = j;
4460: }
4461: PetscCall(PetscFree2(cols1, cols2));
4462: /* If not enough inodes found,, do not use inode version of the routines */
4463: if (!m || node_count > .8 * m) {
4464: PetscCall(PetscFree(ns));
4466: a->inode.node_count = 0;
4467: a->inode.size = NULL;
4468: a->inode.use = PETSC_FALSE;
4470: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4471: } else {
4472: A->ops->mult = NULL;
4473: A->ops->sor = NULL;
4474: A->ops->multadd = NULL;
4475: A->ops->getrowij = NULL;
4476: A->ops->restorerowij = NULL;
4477: A->ops->getcolumnij = NULL;
4478: A->ops->restorecolumnij = NULL;
4479: A->ops->coloringpatch = NULL;
4480: A->ops->multdiagonalblock = NULL;
4481: a->inode.node_count = node_count;
4482: a->inode.size = ns;
4484: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4485: }
4486: a->inode.checked = PETSC_TRUE;
4487: PetscFunctionReturn(PETSC_SUCCESS);
4488: }
4490: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4491: {
4492: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4494: PetscFunctionBegin;
4495: a->inode.ibdiagvalid = PETSC_FALSE;
4496: PetscFunctionReturn(PETSC_SUCCESS);
4497: }
4499: /*
4500: This is really ugly. if inodes are used this replaces the
4501: permutations with ones that correspond to rows/cols of the matrix
4502: rather than inode blocks
4503: */
4504: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4505: {
4506: PetscFunctionBegin;
4507: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4508: PetscFunctionReturn(PETSC_SUCCESS);
4509: }
4511: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4512: {
4513: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4514: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4515: const PetscInt *ridx, *cidx;
4516: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4517: PetscInt nslim_col, *ns_col;
4518: IS ris = *rperm, cis = *cperm;
4520: PetscFunctionBegin;
4521: if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4522: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4524: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4525: PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4526: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4528: PetscCall(ISGetIndices(ris, &ridx));
4529: PetscCall(ISGetIndices(cis, &cidx));
4531: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4532: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4534: /* Construct the permutations for rows*/
4535: for (i = 0, row = 0; i < nslim_row; ++i) {
4536: indx = ridx[i];
4537: start_val = tns[indx];
4538: end_val = tns[indx + 1];
4539: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4540: }
4542: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4543: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4545: /* Construct permutations for columns */
4546: for (i = 0, col = 0; i < nslim_col; ++i) {
4547: indx = cidx[i];
4548: start_val = tns[indx];
4549: end_val = tns[indx + 1];
4550: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4551: }
4553: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4554: PetscCall(ISSetPermutation(*rperm));
4555: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4556: PetscCall(ISSetPermutation(*cperm));
4558: PetscCall(ISRestoreIndices(ris, &ridx));
4559: PetscCall(ISRestoreIndices(cis, &cidx));
4561: PetscCall(PetscFree(ns_col));
4562: PetscCall(PetscFree2(permr, permc));
4563: PetscCall(ISDestroy(&cis));
4564: PetscCall(ISDestroy(&ris));
4565: PetscCall(PetscFree(tns));
4566: PetscFunctionReturn(PETSC_SUCCESS);
4567: }
4569: /*@C
4570: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4572: Not Collective
4574: Input Parameter:
4575: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4577: Output Parameters:
4578: + node_count - no of inodes present in the matrix.
4579: . sizes - an array of size `node_count`, with the sizes of each inode.
4580: - limit - the max size used to generate the inodes.
4582: Level: advanced
4584: Note:
4585: It should be called after the matrix is assembled.
4586: The contents of the sizes[] array should not be changed.
4587: `NULL` may be passed for information not needed
4589: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4590: @*/
4591: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4592: {
4593: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4595: PetscFunctionBegin;
4596: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4597: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4598: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4599: PetscFunctionReturn(PETSC_SUCCESS);
4600: }
4602: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4603: {
4604: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4606: PetscFunctionBegin;
4607: if (node_count) *node_count = a->inode.node_count;
4608: if (sizes) *sizes = a->inode.size;
4609: if (limit) *limit = a->inode.limit;
4610: PetscFunctionReturn(PETSC_SUCCESS);
4611: }