Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++);
24: for (; count + 1 < n; count++, i++);
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
31: /* Use the same row structure wherever feasible. */
32: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
34: /* if m < n; pad up the remainder with inode_limit */
35: for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36: /* The last node is the odd ball. pad it up with the remaining rows; */
37: if (count < n) {
38: ns_col[i] = n - count;
39: i++;
40: } else if (count > n) {
41: /* Adjust for the over estimation */
42: ns_col[i - 1] += n - count;
43: }
44: *size = i;
45: *ns = ns_col;
46: PetscFunctionReturn(PETSC_SUCCESS);
47: }
49: /*
50: This builds symmetric version of nonzero structure,
51: */
52: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53: {
54: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
55: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56: PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59: PetscFunctionBegin;
60: nslim_row = a->inode.node_count;
61: m = A->rmap->n;
62: n = A->cmap->n;
63: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66: /* Use the row_inode as column_inode */
67: nslim_col = nslim_row;
68: ns_col = ns_row;
70: /* allocate space for reformatted inode structure */
71: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
74: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75: nsz = ns_col[i1];
76: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77: }
78: /* allocate space for row pointers */
79: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80: *iia = ia;
81: PetscCall(PetscMalloc1(nslim_row + 1, &work));
83: /* determine the number of columns in each row */
84: ia[0] = oshift;
85: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86: j = aj + ai[row] + ishift;
87: jmax = aj + ai[row + 1] + ishift;
88: if (j == jmax) continue; /* empty row */
89: col = *j++ + ishift;
90: i2 = tvc[col];
91: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92: ia[i1 + 1]++;
93: ia[i2 + 1]++;
94: i2++; /* Start col of next node */
95: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96: i2 = tvc[col];
97: }
98: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99: }
101: /* shift ia[i] to point to next row */
102: for (i1 = 1; i1 < nslim_row + 1; i1++) {
103: row = ia[i1 - 1];
104: ia[i1] += row;
105: work[i1 - 1] = row - oshift;
106: }
108: /* allocate space for column pointers */
109: nz = ia[nslim_row] + (!ishift);
110: PetscCall(PetscMalloc1(nz, &ja));
111: *jja = ja;
113: /* loop over lower triangular part putting into ja */
114: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115: j = aj + ai[row] + ishift;
116: jmax = aj + ai[row + 1] + ishift;
117: if (j == jmax) continue; /* empty row */
118: col = *j++ + ishift;
119: i2 = tvc[col];
120: while (i2 < i1 && j < jmax) {
121: ja[work[i2]++] = i1 + oshift;
122: ja[work[i1]++] = i2 + oshift;
123: ++i2;
124: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125: i2 = tvc[col];
126: }
127: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128: }
129: PetscCall(PetscFree(work));
130: PetscCall(PetscFree2(tns, tvc));
131: PetscFunctionReturn(PETSC_SUCCESS);
132: }
134: /*
135: This builds nonsymmetric version of nonzero structure,
136: */
137: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138: {
139: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
140: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141: PetscInt *tns, *tvc, nsz, i1, i2;
142: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
144: PetscFunctionBegin;
145: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146: nslim_row = a->inode.node_count;
147: n = A->cmap->n;
149: /* Create The column_inode for this matrix */
150: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
152: /* allocate space for reformatted column_inode structure */
153: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
156: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157: nsz = ns_col[i1];
158: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159: }
160: /* allocate space for row pointers */
161: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162: *iia = ia;
163: PetscCall(PetscMalloc1(nslim_row + 1, &work));
165: /* determine the number of columns in each row */
166: ia[0] = oshift;
167: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168: j = aj + ai[row] + ishift;
169: nz = ai[row + 1] - ai[row];
170: if (!nz) continue; /* empty row */
171: col = *j++ + ishift;
172: i2 = tvc[col];
173: while (nz-- > 0) { /* off-diagonal elements */
174: ia[i1 + 1]++;
175: i2++; /* Start col of next node */
176: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177: if (nz > 0) i2 = tvc[col];
178: }
179: }
181: /* shift ia[i] to point to next row */
182: for (i1 = 1; i1 < nslim_row + 1; i1++) {
183: row = ia[i1 - 1];
184: ia[i1] += row;
185: work[i1 - 1] = row - oshift;
186: }
188: /* allocate space for column pointers */
189: nz = ia[nslim_row] + (!ishift);
190: PetscCall(PetscMalloc1(nz, &ja));
191: *jja = ja;
193: /* loop over matrix putting into ja */
194: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195: j = aj + ai[row] + ishift;
196: nz = ai[row + 1] - ai[row];
197: if (!nz) continue; /* empty row */
198: col = *j++ + ishift;
199: i2 = tvc[col];
200: while (nz-- > 0) {
201: ja[work[i1]++] = i2 + oshift;
202: ++i2;
203: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204: if (nz > 0) i2 = tvc[col];
205: }
206: }
207: PetscCall(PetscFree(ns_col));
208: PetscCall(PetscFree(work));
209: PetscCall(PetscFree2(tns, tvc));
210: PetscFunctionReturn(PETSC_SUCCESS);
211: }
213: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214: {
215: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
217: PetscFunctionBegin;
218: if (n) *n = a->inode.node_count;
219: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220: if (!blockcompressed) {
221: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222: } else if (symmetric) {
223: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224: } else {
225: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226: }
227: PetscFunctionReturn(PETSC_SUCCESS);
228: }
230: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231: {
232: PetscFunctionBegin;
233: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
235: if (!blockcompressed) {
236: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237: } else {
238: PetscCall(PetscFree(*ia));
239: PetscCall(PetscFree(*ja));
240: }
241: PetscFunctionReturn(PETSC_SUCCESS);
242: }
244: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245: {
246: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
247: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
248: PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
250: PetscFunctionBegin;
251: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
252: nslim_row = a->inode.node_count;
253: n = A->cmap->n;
255: /* Create The column_inode for this matrix */
256: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
258: /* allocate space for reformatted column_inode structure */
259: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
260: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
262: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
263: nsz = ns_col[i1];
264: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
265: }
266: /* allocate space for column pointers */
267: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
268: *iia = ia;
269: PetscCall(PetscMalloc1(nslim_col + 1, &work));
271: /* determine the number of columns in each row */
272: ia[0] = oshift;
273: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
274: j = aj + ai[row] + ishift;
275: col = *j++ + ishift;
276: i2 = tvc[col];
277: nz = ai[row + 1] - ai[row];
278: while (nz-- > 0) { /* off-diagonal elements */
279: /* ia[i1+1]++; */
280: ia[i2 + 1]++;
281: i2++;
282: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
283: if (nz > 0) i2 = tvc[col];
284: }
285: }
287: /* shift ia[i] to point to next col */
288: for (i1 = 1; i1 < nslim_col + 1; i1++) {
289: col = ia[i1 - 1];
290: ia[i1] += col;
291: work[i1 - 1] = col - oshift;
292: }
294: /* allocate space for column pointers */
295: nz = ia[nslim_col] + (!ishift);
296: PetscCall(PetscMalloc1(nz, &ja));
297: *jja = ja;
299: /* loop over matrix putting into ja */
300: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
301: j = aj + ai[row] + ishift;
302: col = *j++ + ishift;
303: i2 = tvc[col];
304: nz = ai[row + 1] - ai[row];
305: while (nz-- > 0) {
306: /* ja[work[i1]++] = i2 + oshift; */
307: ja[work[i2]++] = i1 + oshift;
308: i2++;
309: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
310: if (nz > 0) i2 = tvc[col];
311: }
312: }
313: PetscCall(PetscFree(ns_col));
314: PetscCall(PetscFree(work));
315: PetscCall(PetscFree2(tns, tvc));
316: PetscFunctionReturn(PETSC_SUCCESS);
317: }
319: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320: {
321: PetscFunctionBegin;
322: PetscCall(MatCreateColInode_Private(A, n, NULL));
323: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
325: if (!blockcompressed) {
326: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
327: } else if (symmetric) {
328: /* Since the indices are symmetric it doesn't matter */
329: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
330: } else {
331: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
332: }
333: PetscFunctionReturn(PETSC_SUCCESS);
334: }
336: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337: {
338: PetscFunctionBegin;
339: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
340: if (!blockcompressed) {
341: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
342: } else {
343: PetscCall(PetscFree(*ia));
344: PetscCall(PetscFree(*ja));
345: }
346: PetscFunctionReturn(PETSC_SUCCESS);
347: }
349: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350: {
351: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
352: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353: PetscScalar *y;
354: const PetscScalar *x;
355: const MatScalar *v1, *v2, *v3, *v4, *v5;
356: PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
357: const PetscInt *idx, *ns, *ii;
359: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
360: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
361: #endif
363: PetscFunctionBegin;
364: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
365: node_max = a->inode.node_count;
366: ns = a->inode.size; /* Node Size array */
367: PetscCall(VecGetArrayRead(xx, &x));
368: PetscCall(VecGetArray(yy, &y));
369: idx = a->j;
370: v1 = a->a;
371: ii = a->i;
373: for (i = 0, row = 0; i < node_max; ++i) {
374: nsz = ns[i];
375: n = ii[1] - ii[0];
376: nonzerorow += (n > 0) * nsz;
377: ii += nsz;
378: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
379: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
380: sz = n; /* No of non zeros in this row */
381: /* Switch on the size of Node */
382: switch (nsz) { /* Each loop in 'case' is unrolled */
383: case 1:
384: sum1 = 0.;
386: for (n = 0; n < sz - 1; n += 2) {
387: i1 = idx[0]; /* The instructions are ordered to */
388: i2 = idx[1]; /* make the compiler's job easy */
389: idx += 2;
390: tmp0 = x[i1];
391: tmp1 = x[i2];
392: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
393: v1 += 2;
394: }
396: if (n == sz - 1) { /* Take care of the last nonzero */
397: tmp0 = x[*idx++];
398: sum1 += *v1++ * tmp0;
399: }
400: y[row++] = sum1;
401: break;
402: case 2:
403: sum1 = 0.;
404: sum2 = 0.;
405: v2 = v1 + n;
407: for (n = 0; n < sz - 1; n += 2) {
408: i1 = idx[0];
409: i2 = idx[1];
410: idx += 2;
411: tmp0 = x[i1];
412: tmp1 = x[i2];
413: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
414: v1 += 2;
415: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
416: v2 += 2;
417: }
418: if (n == sz - 1) {
419: tmp0 = x[*idx++];
420: sum1 += *v1++ * tmp0;
421: sum2 += *v2++ * tmp0;
422: }
423: y[row++] = sum1;
424: y[row++] = sum2;
425: v1 = v2; /* Since the next block to be processed starts there*/
426: idx += sz;
427: break;
428: case 3:
429: sum1 = 0.;
430: sum2 = 0.;
431: sum3 = 0.;
432: v2 = v1 + n;
433: v3 = v2 + n;
435: for (n = 0; n < sz - 1; n += 2) {
436: i1 = idx[0];
437: i2 = idx[1];
438: idx += 2;
439: tmp0 = x[i1];
440: tmp1 = x[i2];
441: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
442: v1 += 2;
443: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
444: v2 += 2;
445: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
446: v3 += 2;
447: }
448: if (n == sz - 1) {
449: tmp0 = x[*idx++];
450: sum1 += *v1++ * tmp0;
451: sum2 += *v2++ * tmp0;
452: sum3 += *v3++ * tmp0;
453: }
454: y[row++] = sum1;
455: y[row++] = sum2;
456: y[row++] = sum3;
457: v1 = v3; /* Since the next block to be processed starts there*/
458: idx += 2 * sz;
459: break;
460: case 4:
461: sum1 = 0.;
462: sum2 = 0.;
463: sum3 = 0.;
464: sum4 = 0.;
465: v2 = v1 + n;
466: v3 = v2 + n;
467: v4 = v3 + n;
469: for (n = 0; n < sz - 1; n += 2) {
470: i1 = idx[0];
471: i2 = idx[1];
472: idx += 2;
473: tmp0 = x[i1];
474: tmp1 = x[i2];
475: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
476: v1 += 2;
477: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
478: v2 += 2;
479: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
480: v3 += 2;
481: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
482: v4 += 2;
483: }
484: if (n == sz - 1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: sum4 += *v4++ * tmp0;
490: }
491: y[row++] = sum1;
492: y[row++] = sum2;
493: y[row++] = sum3;
494: y[row++] = sum4;
495: v1 = v4; /* Since the next block to be processed starts there*/
496: idx += 3 * sz;
497: break;
498: case 5:
499: sum1 = 0.;
500: sum2 = 0.;
501: sum3 = 0.;
502: sum4 = 0.;
503: sum5 = 0.;
504: v2 = v1 + n;
505: v3 = v2 + n;
506: v4 = v3 + n;
507: v5 = v4 + n;
509: for (n = 0; n < sz - 1; n += 2) {
510: i1 = idx[0];
511: i2 = idx[1];
512: idx += 2;
513: tmp0 = x[i1];
514: tmp1 = x[i2];
515: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
516: v1 += 2;
517: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
518: v2 += 2;
519: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
520: v3 += 2;
521: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
522: v4 += 2;
523: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
524: v5 += 2;
525: }
526: if (n == sz - 1) {
527: tmp0 = x[*idx++];
528: sum1 += *v1++ * tmp0;
529: sum2 += *v2++ * tmp0;
530: sum3 += *v3++ * tmp0;
531: sum4 += *v4++ * tmp0;
532: sum5 += *v5++ * tmp0;
533: }
534: y[row++] = sum1;
535: y[row++] = sum2;
536: y[row++] = sum3;
537: y[row++] = sum4;
538: y[row++] = sum5;
539: v1 = v5; /* Since the next block to be processed starts there */
540: idx += 4 * sz;
541: break;
542: default:
543: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
544: }
545: }
546: PetscCall(VecRestoreArrayRead(xx, &x));
547: PetscCall(VecRestoreArray(yy, &y));
548: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
549: PetscFunctionReturn(PETSC_SUCCESS);
550: }
552: /* Almost same code as the MatMult_SeqAIJ_Inode() */
553: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554: {
555: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
556: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
557: const MatScalar *v1, *v2, *v3, *v4, *v5;
558: const PetscScalar *x;
559: PetscScalar *y, *z, *zt;
560: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
561: const PetscInt *idx, *ns, *ii;
563: PetscFunctionBegin;
564: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
565: node_max = a->inode.node_count;
566: ns = a->inode.size; /* Node Size array */
568: PetscCall(VecGetArrayRead(xx, &x));
569: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
570: zt = z;
572: idx = a->j;
573: v1 = a->a;
574: ii = a->i;
576: for (i = 0, row = 0; i < node_max; ++i) {
577: nsz = ns[i];
578: n = ii[1] - ii[0];
579: ii += nsz;
580: sz = n; /* No of non zeros in this row */
581: /* Switch on the size of Node */
582: switch (nsz) { /* Each loop in 'case' is unrolled */
583: case 1:
584: sum1 = *zt++;
586: for (n = 0; n < sz - 1; n += 2) {
587: i1 = idx[0]; /* The instructions are ordered to */
588: i2 = idx[1]; /* make the compiler's job easy */
589: idx += 2;
590: tmp0 = x[i1];
591: tmp1 = x[i2];
592: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
593: v1 += 2;
594: }
596: if (n == sz - 1) { /* Take care of the last nonzero */
597: tmp0 = x[*idx++];
598: sum1 += *v1++ * tmp0;
599: }
600: y[row++] = sum1;
601: break;
602: case 2:
603: sum1 = *zt++;
604: sum2 = *zt++;
605: v2 = v1 + n;
607: for (n = 0; n < sz - 1; n += 2) {
608: i1 = idx[0];
609: i2 = idx[1];
610: idx += 2;
611: tmp0 = x[i1];
612: tmp1 = x[i2];
613: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
614: v1 += 2;
615: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
616: v2 += 2;
617: }
618: if (n == sz - 1) {
619: tmp0 = x[*idx++];
620: sum1 += *v1++ * tmp0;
621: sum2 += *v2++ * tmp0;
622: }
623: y[row++] = sum1;
624: y[row++] = sum2;
625: v1 = v2; /* Since the next block to be processed starts there*/
626: idx += sz;
627: break;
628: case 3:
629: sum1 = *zt++;
630: sum2 = *zt++;
631: sum3 = *zt++;
632: v2 = v1 + n;
633: v3 = v2 + n;
635: for (n = 0; n < sz - 1; n += 2) {
636: i1 = idx[0];
637: i2 = idx[1];
638: idx += 2;
639: tmp0 = x[i1];
640: tmp1 = x[i2];
641: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
642: v1 += 2;
643: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
644: v2 += 2;
645: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
646: v3 += 2;
647: }
648: if (n == sz - 1) {
649: tmp0 = x[*idx++];
650: sum1 += *v1++ * tmp0;
651: sum2 += *v2++ * tmp0;
652: sum3 += *v3++ * tmp0;
653: }
654: y[row++] = sum1;
655: y[row++] = sum2;
656: y[row++] = sum3;
657: v1 = v3; /* Since the next block to be processed starts there*/
658: idx += 2 * sz;
659: break;
660: case 4:
661: sum1 = *zt++;
662: sum2 = *zt++;
663: sum3 = *zt++;
664: sum4 = *zt++;
665: v2 = v1 + n;
666: v3 = v2 + n;
667: v4 = v3 + n;
669: for (n = 0; n < sz - 1; n += 2) {
670: i1 = idx[0];
671: i2 = idx[1];
672: idx += 2;
673: tmp0 = x[i1];
674: tmp1 = x[i2];
675: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
676: v1 += 2;
677: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
678: v2 += 2;
679: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
680: v3 += 2;
681: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
682: v4 += 2;
683: }
684: if (n == sz - 1) {
685: tmp0 = x[*idx++];
686: sum1 += *v1++ * tmp0;
687: sum2 += *v2++ * tmp0;
688: sum3 += *v3++ * tmp0;
689: sum4 += *v4++ * tmp0;
690: }
691: y[row++] = sum1;
692: y[row++] = sum2;
693: y[row++] = sum3;
694: y[row++] = sum4;
695: v1 = v4; /* Since the next block to be processed starts there*/
696: idx += 3 * sz;
697: break;
698: case 5:
699: sum1 = *zt++;
700: sum2 = *zt++;
701: sum3 = *zt++;
702: sum4 = *zt++;
703: sum5 = *zt++;
704: v2 = v1 + n;
705: v3 = v2 + n;
706: v4 = v3 + n;
707: v5 = v4 + n;
709: for (n = 0; n < sz - 1; n += 2) {
710: i1 = idx[0];
711: i2 = idx[1];
712: idx += 2;
713: tmp0 = x[i1];
714: tmp1 = x[i2];
715: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
716: v1 += 2;
717: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
718: v2 += 2;
719: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
720: v3 += 2;
721: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
722: v4 += 2;
723: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
724: v5 += 2;
725: }
726: if (n == sz - 1) {
727: tmp0 = x[*idx++];
728: sum1 += *v1++ * tmp0;
729: sum2 += *v2++ * tmp0;
730: sum3 += *v3++ * tmp0;
731: sum4 += *v4++ * tmp0;
732: sum5 += *v5++ * tmp0;
733: }
734: y[row++] = sum1;
735: y[row++] = sum2;
736: y[row++] = sum3;
737: y[row++] = sum4;
738: y[row++] = sum5;
739: v1 = v5; /* Since the next block to be processed starts there */
740: idx += 4 * sz;
741: break;
742: default:
743: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
744: }
745: }
746: PetscCall(VecRestoreArrayRead(xx, &x));
747: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
748: PetscCall(PetscLogFlops(2.0 * a->nz));
749: PetscFunctionReturn(PETSC_SUCCESS);
750: }
752: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753: {
754: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
755: IS iscol = a->col, isrow = a->row;
756: const PetscInt *r, *c, *rout, *cout;
757: PetscInt i, j, n = A->rmap->n, nz;
758: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
759: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
761: PetscScalar sum1, sum2, sum3, sum4, sum5;
762: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763: const PetscScalar *b;
765: PetscFunctionBegin;
766: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
767: node_max = a->inode.node_count;
768: ns = a->inode.size; /* Node Size array */
770: PetscCall(VecGetArrayRead(bb, &b));
771: PetscCall(VecGetArrayWrite(xx, &x));
772: tmp = a->solve_work;
774: PetscCall(ISGetIndices(isrow, &rout));
775: r = rout;
776: PetscCall(ISGetIndices(iscol, &cout));
777: c = cout + (n - 1);
779: /* forward solve the lower triangular */
780: tmps = tmp;
781: aa = a_a;
782: aj = a_j;
783: ad = a->diag;
785: for (i = 0, row = 0; i < node_max; ++i) {
786: nsz = ns[i];
787: aii = ai[row];
788: v1 = aa + aii;
789: vi = aj + aii;
790: nz = ad[row] - aii;
791: if (i < node_max - 1) {
792: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
793: * but our indexing to determine its size could. */
794: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
795: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
796: PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
797: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
798: }
800: switch (nsz) { /* Each loop in 'case' is unrolled */
801: case 1:
802: sum1 = b[*r++];
803: for (j = 0; j < nz - 1; j += 2) {
804: i0 = vi[0];
805: i1 = vi[1];
806: vi += 2;
807: tmp0 = tmps[i0];
808: tmp1 = tmps[i1];
809: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
810: v1 += 2;
811: }
812: if (j == nz - 1) {
813: tmp0 = tmps[*vi++];
814: sum1 -= *v1++ * tmp0;
815: }
816: tmp[row++] = sum1;
817: break;
818: case 2:
819: sum1 = b[*r++];
820: sum2 = b[*r++];
821: v2 = aa + ai[row + 1];
823: for (j = 0; j < nz - 1; j += 2) {
824: i0 = vi[0];
825: i1 = vi[1];
826: vi += 2;
827: tmp0 = tmps[i0];
828: tmp1 = tmps[i1];
829: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
830: v1 += 2;
831: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
832: v2 += 2;
833: }
834: if (j == nz - 1) {
835: tmp0 = tmps[*vi++];
836: sum1 -= *v1++ * tmp0;
837: sum2 -= *v2++ * tmp0;
838: }
839: sum2 -= *v2++ * sum1;
840: tmp[row++] = sum1;
841: tmp[row++] = sum2;
842: break;
843: case 3:
844: sum1 = b[*r++];
845: sum2 = b[*r++];
846: sum3 = b[*r++];
847: v2 = aa + ai[row + 1];
848: v3 = aa + ai[row + 2];
850: for (j = 0; j < nz - 1; j += 2) {
851: i0 = vi[0];
852: i1 = vi[1];
853: vi += 2;
854: tmp0 = tmps[i0];
855: tmp1 = tmps[i1];
856: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
857: v1 += 2;
858: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
859: v2 += 2;
860: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
861: v3 += 2;
862: }
863: if (j == nz - 1) {
864: tmp0 = tmps[*vi++];
865: sum1 -= *v1++ * tmp0;
866: sum2 -= *v2++ * tmp0;
867: sum3 -= *v3++ * tmp0;
868: }
869: sum2 -= *v2++ * sum1;
870: sum3 -= *v3++ * sum1;
871: sum3 -= *v3++ * sum2;
873: tmp[row++] = sum1;
874: tmp[row++] = sum2;
875: tmp[row++] = sum3;
876: break;
878: case 4:
879: sum1 = b[*r++];
880: sum2 = b[*r++];
881: sum3 = b[*r++];
882: sum4 = b[*r++];
883: v2 = aa + ai[row + 1];
884: v3 = aa + ai[row + 2];
885: v4 = aa + ai[row + 3];
887: for (j = 0; j < nz - 1; j += 2) {
888: i0 = vi[0];
889: i1 = vi[1];
890: vi += 2;
891: tmp0 = tmps[i0];
892: tmp1 = tmps[i1];
893: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
894: v1 += 2;
895: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
896: v2 += 2;
897: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
898: v3 += 2;
899: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
900: v4 += 2;
901: }
902: if (j == nz - 1) {
903: tmp0 = tmps[*vi++];
904: sum1 -= *v1++ * tmp0;
905: sum2 -= *v2++ * tmp0;
906: sum3 -= *v3++ * tmp0;
907: sum4 -= *v4++ * tmp0;
908: }
909: sum2 -= *v2++ * sum1;
910: sum3 -= *v3++ * sum1;
911: sum4 -= *v4++ * sum1;
912: sum3 -= *v3++ * sum2;
913: sum4 -= *v4++ * sum2;
914: sum4 -= *v4++ * sum3;
916: tmp[row++] = sum1;
917: tmp[row++] = sum2;
918: tmp[row++] = sum3;
919: tmp[row++] = sum4;
920: break;
921: case 5:
922: sum1 = b[*r++];
923: sum2 = b[*r++];
924: sum3 = b[*r++];
925: sum4 = b[*r++];
926: sum5 = b[*r++];
927: v2 = aa + ai[row + 1];
928: v3 = aa + ai[row + 2];
929: v4 = aa + ai[row + 3];
930: v5 = aa + ai[row + 4];
932: for (j = 0; j < nz - 1; j += 2) {
933: i0 = vi[0];
934: i1 = vi[1];
935: vi += 2;
936: tmp0 = tmps[i0];
937: tmp1 = tmps[i1];
938: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
939: v1 += 2;
940: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
941: v2 += 2;
942: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
943: v3 += 2;
944: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
945: v4 += 2;
946: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
947: v5 += 2;
948: }
949: if (j == nz - 1) {
950: tmp0 = tmps[*vi++];
951: sum1 -= *v1++ * tmp0;
952: sum2 -= *v2++ * tmp0;
953: sum3 -= *v3++ * tmp0;
954: sum4 -= *v4++ * tmp0;
955: sum5 -= *v5++ * tmp0;
956: }
958: sum2 -= *v2++ * sum1;
959: sum3 -= *v3++ * sum1;
960: sum4 -= *v4++ * sum1;
961: sum5 -= *v5++ * sum1;
962: sum3 -= *v3++ * sum2;
963: sum4 -= *v4++ * sum2;
964: sum5 -= *v5++ * sum2;
965: sum4 -= *v4++ * sum3;
966: sum5 -= *v5++ * sum3;
967: sum5 -= *v5++ * sum4;
969: tmp[row++] = sum1;
970: tmp[row++] = sum2;
971: tmp[row++] = sum3;
972: tmp[row++] = sum4;
973: tmp[row++] = sum5;
974: break;
975: default:
976: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
977: }
978: }
979: /* backward solve the upper triangular */
980: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
981: nsz = ns[i];
982: aii = ai[row + 1] - 1;
983: v1 = aa + aii;
984: vi = aj + aii;
985: nz = aii - ad[row];
986: switch (nsz) { /* Each loop in 'case' is unrolled */
987: case 1:
988: sum1 = tmp[row];
990: for (j = nz; j > 1; j -= 2) {
991: vi -= 2;
992: i0 = vi[2];
993: i1 = vi[1];
994: tmp0 = tmps[i0];
995: tmp1 = tmps[i1];
996: v1 -= 2;
997: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
998: }
999: if (j == 1) {
1000: tmp0 = tmps[*vi--];
1001: sum1 -= *v1-- * tmp0;
1002: }
1003: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1004: row--;
1005: break;
1006: case 2:
1007: sum1 = tmp[row];
1008: sum2 = tmp[row - 1];
1009: v2 = aa + ai[row] - 1;
1010: for (j = nz; j > 1; j -= 2) {
1011: vi -= 2;
1012: i0 = vi[2];
1013: i1 = vi[1];
1014: tmp0 = tmps[i0];
1015: tmp1 = tmps[i1];
1016: v1 -= 2;
1017: v2 -= 2;
1018: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020: }
1021: if (j == 1) {
1022: tmp0 = tmps[*vi--];
1023: sum1 -= *v1-- * tmp0;
1024: sum2 -= *v2-- * tmp0;
1025: }
1027: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1028: row--;
1029: sum2 -= *v2-- * tmp0;
1030: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1031: row--;
1032: break;
1033: case 3:
1034: sum1 = tmp[row];
1035: sum2 = tmp[row - 1];
1036: sum3 = tmp[row - 2];
1037: v2 = aa + ai[row] - 1;
1038: v3 = aa + ai[row - 1] - 1;
1039: for (j = nz; j > 1; j -= 2) {
1040: vi -= 2;
1041: i0 = vi[2];
1042: i1 = vi[1];
1043: tmp0 = tmps[i0];
1044: tmp1 = tmps[i1];
1045: v1 -= 2;
1046: v2 -= 2;
1047: v3 -= 2;
1048: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051: }
1052: if (j == 1) {
1053: tmp0 = tmps[*vi--];
1054: sum1 -= *v1-- * tmp0;
1055: sum2 -= *v2-- * tmp0;
1056: sum3 -= *v3-- * tmp0;
1057: }
1058: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1059: row--;
1060: sum2 -= *v2-- * tmp0;
1061: sum3 -= *v3-- * tmp0;
1062: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1063: row--;
1064: sum3 -= *v3-- * tmp0;
1065: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1066: row--;
1068: break;
1069: case 4:
1070: sum1 = tmp[row];
1071: sum2 = tmp[row - 1];
1072: sum3 = tmp[row - 2];
1073: sum4 = tmp[row - 3];
1074: v2 = aa + ai[row] - 1;
1075: v3 = aa + ai[row - 1] - 1;
1076: v4 = aa + ai[row - 2] - 1;
1078: for (j = nz; j > 1; j -= 2) {
1079: vi -= 2;
1080: i0 = vi[2];
1081: i1 = vi[1];
1082: tmp0 = tmps[i0];
1083: tmp1 = tmps[i1];
1084: v1 -= 2;
1085: v2 -= 2;
1086: v3 -= 2;
1087: v4 -= 2;
1088: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092: }
1093: if (j == 1) {
1094: tmp0 = tmps[*vi--];
1095: sum1 -= *v1-- * tmp0;
1096: sum2 -= *v2-- * tmp0;
1097: sum3 -= *v3-- * tmp0;
1098: sum4 -= *v4-- * tmp0;
1099: }
1101: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1102: row--;
1103: sum2 -= *v2-- * tmp0;
1104: sum3 -= *v3-- * tmp0;
1105: sum4 -= *v4-- * tmp0;
1106: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1107: row--;
1108: sum3 -= *v3-- * tmp0;
1109: sum4 -= *v4-- * tmp0;
1110: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1111: row--;
1112: sum4 -= *v4-- * tmp0;
1113: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1114: row--;
1115: break;
1116: case 5:
1117: sum1 = tmp[row];
1118: sum2 = tmp[row - 1];
1119: sum3 = tmp[row - 2];
1120: sum4 = tmp[row - 3];
1121: sum5 = tmp[row - 4];
1122: v2 = aa + ai[row] - 1;
1123: v3 = aa + ai[row - 1] - 1;
1124: v4 = aa + ai[row - 2] - 1;
1125: v5 = aa + ai[row - 3] - 1;
1126: for (j = nz; j > 1; j -= 2) {
1127: vi -= 2;
1128: i0 = vi[2];
1129: i1 = vi[1];
1130: tmp0 = tmps[i0];
1131: tmp1 = tmps[i1];
1132: v1 -= 2;
1133: v2 -= 2;
1134: v3 -= 2;
1135: v4 -= 2;
1136: v5 -= 2;
1137: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142: }
1143: if (j == 1) {
1144: tmp0 = tmps[*vi--];
1145: sum1 -= *v1-- * tmp0;
1146: sum2 -= *v2-- * tmp0;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: }
1152: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1153: row--;
1154: sum2 -= *v2-- * tmp0;
1155: sum3 -= *v3-- * tmp0;
1156: sum4 -= *v4-- * tmp0;
1157: sum5 -= *v5-- * tmp0;
1158: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1159: row--;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1164: row--;
1165: sum4 -= *v4-- * tmp0;
1166: sum5 -= *v5-- * tmp0;
1167: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1168: row--;
1169: sum5 -= *v5-- * tmp0;
1170: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1171: row--;
1172: break;
1173: default:
1174: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1175: }
1176: }
1177: PetscCall(ISRestoreIndices(isrow, &rout));
1178: PetscCall(ISRestoreIndices(iscol, &cout));
1179: PetscCall(VecRestoreArrayRead(bb, &b));
1180: PetscCall(VecRestoreArrayWrite(xx, &x));
1181: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1182: PetscFunctionReturn(PETSC_SUCCESS);
1183: }
1185: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186: {
1187: Mat C = B;
1188: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1189: IS isrow = b->row, isicol = b->icol;
1190: const PetscInt *r, *ic, *ics;
1191: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1192: PetscInt i, j, k, nz, nzL, row, *pj;
1193: const PetscInt *ajtmp, *bjtmp;
1194: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1195: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1196: FactorShiftCtx sctx;
1197: const PetscInt *ddiag;
1198: PetscReal rs;
1199: MatScalar d;
1200: PetscInt inod, nodesz, node_max, col;
1201: const PetscInt *ns;
1202: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1204: PetscFunctionBegin;
1205: /* MatPivotSetUp(): initialize shift context sctx */
1206: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1208: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1209: ddiag = a->diag;
1210: sctx.shift_top = info->zeropivot;
1211: for (i = 0; i < n; i++) {
1212: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1213: d = (aa)[ddiag[i]];
1214: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1215: v = aa + ai[i];
1216: nz = ai[i + 1] - ai[i];
1217: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1218: if (rs > sctx.shift_top) sctx.shift_top = rs;
1219: }
1220: sctx.shift_top *= 1.1;
1221: sctx.nshift_max = 5;
1222: sctx.shift_lo = 0.;
1223: sctx.shift_hi = 1.;
1224: }
1226: PetscCall(ISGetIndices(isrow, &r));
1227: PetscCall(ISGetIndices(isicol, &ic));
1229: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1230: ics = ic;
1232: node_max = a->inode.node_count;
1233: ns = a->inode.size;
1234: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1236: /* If max inode size > 4, split it into two inodes.*/
1237: /* also map the inode sizes according to the ordering */
1238: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1239: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240: if (ns[i] > 4) {
1241: tmp_vec1[j] = 4;
1242: ++j;
1243: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1244: } else {
1245: tmp_vec1[j] = ns[i];
1246: }
1247: }
1248: /* Use the correct node_max */
1249: node_max = j;
1251: /* Now reorder the inode info based on mat re-ordering info */
1252: /* First create a row -> inode_size_array_index map */
1253: PetscCall(PetscMalloc1(n + 1, &nsmap));
1254: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1255: for (i = 0, row = 0; i < node_max; i++) {
1256: nodesz = tmp_vec1[i];
1257: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1258: }
1259: /* Using nsmap, create a reordered ns structure */
1260: for (i = 0, j = 0; i < node_max; i++) {
1261: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262: tmp_vec2[i] = nodesz;
1263: j += nodesz;
1264: }
1265: PetscCall(PetscFree(nsmap));
1266: PetscCall(PetscFree(tmp_vec1));
1268: /* Now use the correct ns */
1269: ns = tmp_vec2;
1271: do {
1272: sctx.newshift = PETSC_FALSE;
1273: /* Now loop over each block-row, and do the factorization */
1274: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1275: nodesz = ns[inod];
1277: switch (nodesz) {
1278: case 1:
1279: /* zero rtmp1 */
1280: /* L part */
1281: nz = bi[i + 1] - bi[i];
1282: bjtmp = bj + bi[i];
1283: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1285: /* U part */
1286: nz = bdiag[i] - bdiag[i + 1];
1287: bjtmp = bj + bdiag[i + 1] + 1;
1288: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290: /* load in initial (unfactored row) */
1291: nz = ai[r[i] + 1] - ai[r[i]];
1292: ajtmp = aj + ai[r[i]];
1293: v = aa + ai[r[i]];
1294: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1296: /* ZeropivotApply() */
1297: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1299: /* elimination */
1300: bjtmp = bj + bi[i];
1301: row = *bjtmp++;
1302: nzL = bi[i + 1] - bi[i];
1303: for (k = 0; k < nzL; k++) {
1304: pc = rtmp1 + row;
1305: if (*pc != 0.0) {
1306: pv = b->a + bdiag[row];
1307: mul1 = *pc * (*pv);
1308: *pc = mul1;
1309: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1310: pv = b->a + bdiag[row + 1] + 1;
1311: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1313: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1314: }
1315: row = *bjtmp++;
1316: }
1318: /* finished row so stick it into b->a */
1319: rs = 0.0;
1320: /* L part */
1321: pv = b->a + bi[i];
1322: pj = b->j + bi[i];
1323: nz = bi[i + 1] - bi[i];
1324: for (j = 0; j < nz; j++) {
1325: pv[j] = rtmp1[pj[j]];
1326: rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* U part */
1330: pv = b->a + bdiag[i + 1] + 1;
1331: pj = b->j + bdiag[i + 1] + 1;
1332: nz = bdiag[i] - bdiag[i + 1] - 1;
1333: for (j = 0; j < nz; j++) {
1334: pv[j] = rtmp1[pj[j]];
1335: rs += PetscAbsScalar(pv[j]);
1336: }
1338: /* Check zero pivot */
1339: sctx.rs = rs;
1340: sctx.pv = rtmp1[i];
1341: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1342: if (sctx.newshift) break;
1344: /* Mark diagonal and invert diagonal for simpler triangular solves */
1345: pv = b->a + bdiag[i];
1346: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347: break;
1349: case 2:
1350: /* zero rtmp1 and rtmp2 */
1351: /* L part */
1352: nz = bi[i + 1] - bi[i];
1353: bjtmp = bj + bi[i];
1354: for (j = 0; j < nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0;
1357: rtmp2[col] = 0.0;
1358: }
1360: /* U part */
1361: nz = bdiag[i] - bdiag[i + 1];
1362: bjtmp = bj + bdiag[i + 1] + 1;
1363: for (j = 0; j < nz; j++) {
1364: col = bjtmp[j];
1365: rtmp1[col] = 0.0;
1366: rtmp2[col] = 0.0;
1367: }
1369: /* load in initial (unfactored row) */
1370: nz = ai[r[i] + 1] - ai[r[i]];
1371: ajtmp = aj + ai[r[i]];
1372: v1 = aa + ai[r[i]];
1373: v2 = aa + ai[r[i] + 1];
1374: for (j = 0; j < nz; j++) {
1375: col = ics[ajtmp[j]];
1376: rtmp1[col] = v1[j];
1377: rtmp2[col] = v2[j];
1378: }
1379: /* ZeropivotApply(): shift the diagonal of the matrix */
1380: rtmp1[i] += sctx.shift_amount;
1381: rtmp2[i + 1] += sctx.shift_amount;
1383: /* elimination */
1384: bjtmp = bj + bi[i];
1385: row = *bjtmp++; /* pivot row */
1386: nzL = bi[i + 1] - bi[i];
1387: for (k = 0; k < nzL; k++) {
1388: pc1 = rtmp1 + row;
1389: pc2 = rtmp2 + row;
1390: if (*pc1 != 0.0 || *pc2 != 0.0) {
1391: pv = b->a + bdiag[row];
1392: mul1 = *pc1 * (*pv);
1393: mul2 = *pc2 * (*pv);
1394: *pc1 = mul1;
1395: *pc2 = mul2;
1397: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1398: pv = b->a + bdiag[row + 1] + 1;
1399: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1400: for (j = 0; j < nz; j++) {
1401: col = pj[j];
1402: rtmp1[col] -= mul1 * pv[j];
1403: rtmp2[col] -= mul2 * pv[j];
1404: }
1405: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1406: }
1407: row = *bjtmp++;
1408: }
1410: /* finished row i; check zero pivot, then stick row i into b->a */
1411: rs = 0.0;
1412: /* L part */
1413: pc1 = b->a + bi[i];
1414: pj = b->j + bi[i];
1415: nz = bi[i + 1] - bi[i];
1416: for (j = 0; j < nz; j++) {
1417: col = pj[j];
1418: pc1[j] = rtmp1[col];
1419: rs += PetscAbsScalar(pc1[j]);
1420: }
1421: /* U part */
1422: pc1 = b->a + bdiag[i + 1] + 1;
1423: pj = b->j + bdiag[i + 1] + 1;
1424: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1425: for (j = 0; j < nz; j++) {
1426: col = pj[j];
1427: pc1[j] = rtmp1[col];
1428: rs += PetscAbsScalar(pc1[j]);
1429: }
1431: sctx.rs = rs;
1432: sctx.pv = rtmp1[i];
1433: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1434: if (sctx.newshift) break;
1435: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1436: *pc1 = 1.0 / sctx.pv;
1438: /* Now take care of diagonal 2x2 block. */
1439: pc2 = rtmp2 + i;
1440: if (*pc2 != 0.0) {
1441: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1442: *pc2 = mul1; /* insert L entry */
1443: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1444: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445: for (j = 0; j < nz; j++) {
1446: col = pj[j];
1447: rtmp2[col] -= mul1 * rtmp1[col];
1448: }
1449: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450: }
1452: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453: rs = 0.0;
1454: /* L part */
1455: pc2 = b->a + bi[i + 1];
1456: pj = b->j + bi[i + 1];
1457: nz = bi[i + 2] - bi[i + 1];
1458: for (j = 0; j < nz; j++) {
1459: col = pj[j];
1460: pc2[j] = rtmp2[col];
1461: rs += PetscAbsScalar(pc2[j]);
1462: }
1463: /* U part */
1464: pc2 = b->a + bdiag[i + 2] + 1;
1465: pj = b->j + bdiag[i + 2] + 1;
1466: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467: for (j = 0; j < nz; j++) {
1468: col = pj[j];
1469: pc2[j] = rtmp2[col];
1470: rs += PetscAbsScalar(pc2[j]);
1471: }
1473: sctx.rs = rs;
1474: sctx.pv = rtmp2[i + 1];
1475: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1476: if (sctx.newshift) break;
1477: pc2 = b->a + bdiag[i + 1];
1478: *pc2 = 1.0 / sctx.pv;
1479: break;
1481: case 3:
1482: /* zero rtmp */
1483: /* L part */
1484: nz = bi[i + 1] - bi[i];
1485: bjtmp = bj + bi[i];
1486: for (j = 0; j < nz; j++) {
1487: col = bjtmp[j];
1488: rtmp1[col] = 0.0;
1489: rtmp2[col] = 0.0;
1490: rtmp3[col] = 0.0;
1491: }
1493: /* U part */
1494: nz = bdiag[i] - bdiag[i + 1];
1495: bjtmp = bj + bdiag[i + 1] + 1;
1496: for (j = 0; j < nz; j++) {
1497: col = bjtmp[j];
1498: rtmp1[col] = 0.0;
1499: rtmp2[col] = 0.0;
1500: rtmp3[col] = 0.0;
1501: }
1503: /* load in initial (unfactored row) */
1504: nz = ai[r[i] + 1] - ai[r[i]];
1505: ajtmp = aj + ai[r[i]];
1506: v1 = aa + ai[r[i]];
1507: v2 = aa + ai[r[i] + 1];
1508: v3 = aa + ai[r[i] + 2];
1509: for (j = 0; j < nz; j++) {
1510: col = ics[ajtmp[j]];
1511: rtmp1[col] = v1[j];
1512: rtmp2[col] = v2[j];
1513: rtmp3[col] = v3[j];
1514: }
1515: /* ZeropivotApply(): shift the diagonal of the matrix */
1516: rtmp1[i] += sctx.shift_amount;
1517: rtmp2[i + 1] += sctx.shift_amount;
1518: rtmp3[i + 2] += sctx.shift_amount;
1520: /* elimination */
1521: bjtmp = bj + bi[i];
1522: row = *bjtmp++; /* pivot row */
1523: nzL = bi[i + 1] - bi[i];
1524: for (k = 0; k < nzL; k++) {
1525: pc1 = rtmp1 + row;
1526: pc2 = rtmp2 + row;
1527: pc3 = rtmp3 + row;
1528: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1529: pv = b->a + bdiag[row];
1530: mul1 = *pc1 * (*pv);
1531: mul2 = *pc2 * (*pv);
1532: mul3 = *pc3 * (*pv);
1533: *pc1 = mul1;
1534: *pc2 = mul2;
1535: *pc3 = mul3;
1537: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1538: pv = b->a + bdiag[row + 1] + 1;
1539: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1540: for (j = 0; j < nz; j++) {
1541: col = pj[j];
1542: rtmp1[col] -= mul1 * pv[j];
1543: rtmp2[col] -= mul2 * pv[j];
1544: rtmp3[col] -= mul3 * pv[j];
1545: }
1546: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1547: }
1548: row = *bjtmp++;
1549: }
1551: /* finished row i; check zero pivot, then stick row i into b->a */
1552: rs = 0.0;
1553: /* L part */
1554: pc1 = b->a + bi[i];
1555: pj = b->j + bi[i];
1556: nz = bi[i + 1] - bi[i];
1557: for (j = 0; j < nz; j++) {
1558: col = pj[j];
1559: pc1[j] = rtmp1[col];
1560: rs += PetscAbsScalar(pc1[j]);
1561: }
1562: /* U part */
1563: pc1 = b->a + bdiag[i + 1] + 1;
1564: pj = b->j + bdiag[i + 1] + 1;
1565: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566: for (j = 0; j < nz; j++) {
1567: col = pj[j];
1568: pc1[j] = rtmp1[col];
1569: rs += PetscAbsScalar(pc1[j]);
1570: }
1572: sctx.rs = rs;
1573: sctx.pv = rtmp1[i];
1574: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1575: if (sctx.newshift) break;
1576: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1577: *pc1 = 1.0 / sctx.pv;
1579: /* Now take care of 1st column of diagonal 3x3 block. */
1580: pc2 = rtmp2 + i;
1581: pc3 = rtmp3 + i;
1582: if (*pc2 != 0.0 || *pc3 != 0.0) {
1583: mul2 = (*pc2) * (*pc1);
1584: *pc2 = mul2;
1585: mul3 = (*pc3) * (*pc1);
1586: *pc3 = mul3;
1587: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1588: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1589: for (j = 0; j < nz; j++) {
1590: col = pj[j];
1591: rtmp2[col] -= mul2 * rtmp1[col];
1592: rtmp3[col] -= mul3 * rtmp1[col];
1593: }
1594: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1595: }
1597: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598: rs = 0.0;
1599: /* L part */
1600: pc2 = b->a + bi[i + 1];
1601: pj = b->j + bi[i + 1];
1602: nz = bi[i + 2] - bi[i + 1];
1603: for (j = 0; j < nz; j++) {
1604: col = pj[j];
1605: pc2[j] = rtmp2[col];
1606: rs += PetscAbsScalar(pc2[j]);
1607: }
1608: /* U part */
1609: pc2 = b->a + bdiag[i + 2] + 1;
1610: pj = b->j + bdiag[i + 2] + 1;
1611: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612: for (j = 0; j < nz; j++) {
1613: col = pj[j];
1614: pc2[j] = rtmp2[col];
1615: rs += PetscAbsScalar(pc2[j]);
1616: }
1618: sctx.rs = rs;
1619: sctx.pv = rtmp2[i + 1];
1620: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1621: if (sctx.newshift) break;
1622: pc2 = b->a + bdiag[i + 1];
1623: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1625: /* Now take care of 2nd column of diagonal 3x3 block. */
1626: pc3 = rtmp3 + i + 1;
1627: if (*pc3 != 0.0) {
1628: mul3 = (*pc3) * (*pc2);
1629: *pc3 = mul3;
1630: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1631: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1632: for (j = 0; j < nz; j++) {
1633: col = pj[j];
1634: rtmp3[col] -= mul3 * rtmp2[col];
1635: }
1636: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1637: }
1639: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1640: rs = 0.0;
1641: /* L part */
1642: pc3 = b->a + bi[i + 2];
1643: pj = b->j + bi[i + 2];
1644: nz = bi[i + 3] - bi[i + 2];
1645: for (j = 0; j < nz; j++) {
1646: col = pj[j];
1647: pc3[j] = rtmp3[col];
1648: rs += PetscAbsScalar(pc3[j]);
1649: }
1650: /* U part */
1651: pc3 = b->a + bdiag[i + 3] + 1;
1652: pj = b->j + bdiag[i + 3] + 1;
1653: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1654: for (j = 0; j < nz; j++) {
1655: col = pj[j];
1656: pc3[j] = rtmp3[col];
1657: rs += PetscAbsScalar(pc3[j]);
1658: }
1660: sctx.rs = rs;
1661: sctx.pv = rtmp3[i + 2];
1662: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1663: if (sctx.newshift) break;
1664: pc3 = b->a + bdiag[i + 2];
1665: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1666: break;
1667: case 4:
1668: /* zero rtmp */
1669: /* L part */
1670: nz = bi[i + 1] - bi[i];
1671: bjtmp = bj + bi[i];
1672: for (j = 0; j < nz; j++) {
1673: col = bjtmp[j];
1674: rtmp1[col] = 0.0;
1675: rtmp2[col] = 0.0;
1676: rtmp3[col] = 0.0;
1677: rtmp4[col] = 0.0;
1678: }
1680: /* U part */
1681: nz = bdiag[i] - bdiag[i + 1];
1682: bjtmp = bj + bdiag[i + 1] + 1;
1683: for (j = 0; j < nz; j++) {
1684: col = bjtmp[j];
1685: rtmp1[col] = 0.0;
1686: rtmp2[col] = 0.0;
1687: rtmp3[col] = 0.0;
1688: rtmp4[col] = 0.0;
1689: }
1691: /* load in initial (unfactored row) */
1692: nz = ai[r[i] + 1] - ai[r[i]];
1693: ajtmp = aj + ai[r[i]];
1694: v1 = aa + ai[r[i]];
1695: v2 = aa + ai[r[i] + 1];
1696: v3 = aa + ai[r[i] + 2];
1697: v4 = aa + ai[r[i] + 3];
1698: for (j = 0; j < nz; j++) {
1699: col = ics[ajtmp[j]];
1700: rtmp1[col] = v1[j];
1701: rtmp2[col] = v2[j];
1702: rtmp3[col] = v3[j];
1703: rtmp4[col] = v4[j];
1704: }
1705: /* ZeropivotApply(): shift the diagonal of the matrix */
1706: rtmp1[i] += sctx.shift_amount;
1707: rtmp2[i + 1] += sctx.shift_amount;
1708: rtmp3[i + 2] += sctx.shift_amount;
1709: rtmp4[i + 3] += sctx.shift_amount;
1711: /* elimination */
1712: bjtmp = bj + bi[i];
1713: row = *bjtmp++; /* pivot row */
1714: nzL = bi[i + 1] - bi[i];
1715: for (k = 0; k < nzL; k++) {
1716: pc1 = rtmp1 + row;
1717: pc2 = rtmp2 + row;
1718: pc3 = rtmp3 + row;
1719: pc4 = rtmp4 + row;
1720: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721: pv = b->a + bdiag[row];
1722: mul1 = *pc1 * (*pv);
1723: mul2 = *pc2 * (*pv);
1724: mul3 = *pc3 * (*pv);
1725: mul4 = *pc4 * (*pv);
1726: *pc1 = mul1;
1727: *pc2 = mul2;
1728: *pc3 = mul3;
1729: *pc4 = mul4;
1731: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1732: pv = b->a + bdiag[row + 1] + 1;
1733: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1734: for (j = 0; j < nz; j++) {
1735: col = pj[j];
1736: rtmp1[col] -= mul1 * pv[j];
1737: rtmp2[col] -= mul2 * pv[j];
1738: rtmp3[col] -= mul3 * pv[j];
1739: rtmp4[col] -= mul4 * pv[j];
1740: }
1741: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1742: }
1743: row = *bjtmp++;
1744: }
1746: /* finished row i; check zero pivot, then stick row i into b->a */
1747: rs = 0.0;
1748: /* L part */
1749: pc1 = b->a + bi[i];
1750: pj = b->j + bi[i];
1751: nz = bi[i + 1] - bi[i];
1752: for (j = 0; j < nz; j++) {
1753: col = pj[j];
1754: pc1[j] = rtmp1[col];
1755: rs += PetscAbsScalar(pc1[j]);
1756: }
1757: /* U part */
1758: pc1 = b->a + bdiag[i + 1] + 1;
1759: pj = b->j + bdiag[i + 1] + 1;
1760: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1761: for (j = 0; j < nz; j++) {
1762: col = pj[j];
1763: pc1[j] = rtmp1[col];
1764: rs += PetscAbsScalar(pc1[j]);
1765: }
1767: sctx.rs = rs;
1768: sctx.pv = rtmp1[i];
1769: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1770: if (sctx.newshift) break;
1771: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1772: *pc1 = 1.0 / sctx.pv;
1774: /* Now take care of 1st column of diagonal 4x4 block. */
1775: pc2 = rtmp2 + i;
1776: pc3 = rtmp3 + i;
1777: pc4 = rtmp4 + i;
1778: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1779: mul2 = (*pc2) * (*pc1);
1780: *pc2 = mul2;
1781: mul3 = (*pc3) * (*pc1);
1782: *pc3 = mul3;
1783: mul4 = (*pc4) * (*pc1);
1784: *pc4 = mul4;
1785: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1786: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1787: for (j = 0; j < nz; j++) {
1788: col = pj[j];
1789: rtmp2[col] -= mul2 * rtmp1[col];
1790: rtmp3[col] -= mul3 * rtmp1[col];
1791: rtmp4[col] -= mul4 * rtmp1[col];
1792: }
1793: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1794: }
1796: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1797: rs = 0.0;
1798: /* L part */
1799: pc2 = b->a + bi[i + 1];
1800: pj = b->j + bi[i + 1];
1801: nz = bi[i + 2] - bi[i + 1];
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: pc2[j] = rtmp2[col];
1805: rs += PetscAbsScalar(pc2[j]);
1806: }
1807: /* U part */
1808: pc2 = b->a + bdiag[i + 2] + 1;
1809: pj = b->j + bdiag[i + 2] + 1;
1810: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1811: for (j = 0; j < nz; j++) {
1812: col = pj[j];
1813: pc2[j] = rtmp2[col];
1814: rs += PetscAbsScalar(pc2[j]);
1815: }
1817: sctx.rs = rs;
1818: sctx.pv = rtmp2[i + 1];
1819: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1820: if (sctx.newshift) break;
1821: pc2 = b->a + bdiag[i + 1];
1822: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1824: /* Now take care of 2nd column of diagonal 4x4 block. */
1825: pc3 = rtmp3 + i + 1;
1826: pc4 = rtmp4 + i + 1;
1827: if (*pc3 != 0.0 || *pc4 != 0.0) {
1828: mul3 = (*pc3) * (*pc2);
1829: *pc3 = mul3;
1830: mul4 = (*pc4) * (*pc2);
1831: *pc4 = mul4;
1832: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1833: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1834: for (j = 0; j < nz; j++) {
1835: col = pj[j];
1836: rtmp3[col] -= mul3 * rtmp2[col];
1837: rtmp4[col] -= mul4 * rtmp2[col];
1838: }
1839: PetscCall(PetscLogFlops(4.0 * nz));
1840: }
1842: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1843: rs = 0.0;
1844: /* L part */
1845: pc3 = b->a + bi[i + 2];
1846: pj = b->j + bi[i + 2];
1847: nz = bi[i + 3] - bi[i + 2];
1848: for (j = 0; j < nz; j++) {
1849: col = pj[j];
1850: pc3[j] = rtmp3[col];
1851: rs += PetscAbsScalar(pc3[j]);
1852: }
1853: /* U part */
1854: pc3 = b->a + bdiag[i + 3] + 1;
1855: pj = b->j + bdiag[i + 3] + 1;
1856: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1857: for (j = 0; j < nz; j++) {
1858: col = pj[j];
1859: pc3[j] = rtmp3[col];
1860: rs += PetscAbsScalar(pc3[j]);
1861: }
1863: sctx.rs = rs;
1864: sctx.pv = rtmp3[i + 2];
1865: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1866: if (sctx.newshift) break;
1867: pc3 = b->a + bdiag[i + 2];
1868: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1870: /* Now take care of 3rd column of diagonal 4x4 block. */
1871: pc4 = rtmp4 + i + 2;
1872: if (*pc4 != 0.0) {
1873: mul4 = (*pc4) * (*pc3);
1874: *pc4 = mul4;
1875: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1876: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1877: for (j = 0; j < nz; j++) {
1878: col = pj[j];
1879: rtmp4[col] -= mul4 * rtmp3[col];
1880: }
1881: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1882: }
1884: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1885: rs = 0.0;
1886: /* L part */
1887: pc4 = b->a + bi[i + 3];
1888: pj = b->j + bi[i + 3];
1889: nz = bi[i + 4] - bi[i + 3];
1890: for (j = 0; j < nz; j++) {
1891: col = pj[j];
1892: pc4[j] = rtmp4[col];
1893: rs += PetscAbsScalar(pc4[j]);
1894: }
1895: /* U part */
1896: pc4 = b->a + bdiag[i + 4] + 1;
1897: pj = b->j + bdiag[i + 4] + 1;
1898: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1899: for (j = 0; j < nz; j++) {
1900: col = pj[j];
1901: pc4[j] = rtmp4[col];
1902: rs += PetscAbsScalar(pc4[j]);
1903: }
1905: sctx.rs = rs;
1906: sctx.pv = rtmp4[i + 3];
1907: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1908: if (sctx.newshift) break;
1909: pc4 = b->a + bdiag[i + 3];
1910: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1911: break;
1913: default:
1914: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1915: }
1916: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1917: i += nodesz; /* Update the row */
1918: }
1920: /* MatPivotRefine() */
1921: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1922: /*
1923: * if no shift in this attempt & shifting & started shifting & can refine,
1924: * then try lower shift
1925: */
1926: sctx.shift_hi = sctx.shift_fraction;
1927: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1928: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1929: sctx.newshift = PETSC_TRUE;
1930: sctx.nshift++;
1931: }
1932: } while (sctx.newshift);
1934: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1935: PetscCall(PetscFree(tmp_vec2));
1936: PetscCall(ISRestoreIndices(isicol, &ic));
1937: PetscCall(ISRestoreIndices(isrow, &r));
1939: if (b->inode.size) {
1940: C->ops->solve = MatSolve_SeqAIJ_Inode;
1941: } else {
1942: C->ops->solve = MatSolve_SeqAIJ;
1943: }
1944: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1945: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1946: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1947: C->ops->matsolve = MatMatSolve_SeqAIJ;
1948: C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1949: C->assembled = PETSC_TRUE;
1950: C->preallocated = PETSC_TRUE;
1952: PetscCall(PetscLogFlops(C->cmap->n));
1954: /* MatShiftView(A,info,&sctx) */
1955: if (sctx.nshift) {
1956: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1957: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1958: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1959: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1960: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1961: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1962: }
1963: }
1964: PetscFunctionReturn(PETSC_SUCCESS);
1965: }
1967: #if 0
1968: // unused
1969: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1970: {
1971: Mat C = B;
1972: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1973: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1974: const PetscInt *r, *ic, *c, *ics;
1975: PetscInt n = A->rmap->n, *bi = b->i;
1976: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1977: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1978: PetscInt *ai = a->i, *aj = a->j;
1979: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1980: PetscScalar mul1, mul2, mul3, tmp;
1981: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1982: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1983: PetscReal rs = 0.0;
1984: FactorShiftCtx sctx;
1986: PetscFunctionBegin;
1987: sctx.shift_top = 0;
1988: sctx.nshift_max = 0;
1989: sctx.shift_lo = 0;
1990: sctx.shift_hi = 0;
1991: sctx.shift_fraction = 0;
1993: /* if both shift schemes are chosen by user, only use info->shiftpd */
1994: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1995: sctx.shift_top = 0;
1996: for (i = 0; i < n; i++) {
1997: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1998: rs = 0.0;
1999: ajtmp = aj + ai[i];
2000: rtmp1 = aa + ai[i];
2001: nz = ai[i + 1] - ai[i];
2002: for (j = 0; j < nz; j++) {
2003: if (*ajtmp != i) {
2004: rs += PetscAbsScalar(*rtmp1++);
2005: } else {
2006: rs -= PetscRealPart(*rtmp1++);
2007: }
2008: ajtmp++;
2009: }
2010: if (rs > sctx.shift_top) sctx.shift_top = rs;
2011: }
2012: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2013: sctx.shift_top *= 1.1;
2014: sctx.nshift_max = 5;
2015: sctx.shift_lo = 0.;
2016: sctx.shift_hi = 1.;
2017: }
2018: sctx.shift_amount = 0;
2019: sctx.nshift = 0;
2021: PetscCall(ISGetIndices(isrow, &r));
2022: PetscCall(ISGetIndices(iscol, &c));
2023: PetscCall(ISGetIndices(isicol, &ic));
2024: PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2025: ics = ic;
2027: node_max = a->inode.node_count;
2028: ns = a->inode.size;
2029: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2031: /* If max inode size > 3, split it into two inodes.*/
2032: /* also map the inode sizes according to the ordering */
2033: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2034: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2035: if (ns[i] > 3) {
2036: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2037: ++j;
2038: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2039: } else {
2040: tmp_vec1[j] = ns[i];
2041: }
2042: }
2043: /* Use the correct node_max */
2044: node_max = j;
2046: /* Now reorder the inode info based on mat re-ordering info */
2047: /* First create a row -> inode_size_array_index map */
2048: PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2049: for (i = 0, row = 0; i < node_max; i++) {
2050: nodesz = tmp_vec1[i];
2051: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2052: }
2053: /* Using nsmap, create a reordered ns structure */
2054: for (i = 0, j = 0; i < node_max; i++) {
2055: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2056: tmp_vec2[i] = nodesz;
2057: j += nodesz;
2058: }
2059: PetscCall(PetscFree2(nsmap, tmp_vec1));
2060: /* Now use the correct ns */
2061: ns = tmp_vec2;
2063: do {
2064: sctx.newshift = PETSC_FALSE;
2065: /* Now loop over each block-row, and do the factorization */
2066: for (i = 0, row = 0; i < node_max; i++) {
2067: nodesz = ns[i];
2068: nz = bi[row + 1] - bi[row];
2069: bjtmp = bj + bi[row];
2071: switch (nodesz) {
2072: case 1:
2073: for (j = 0; j < nz; j++) {
2074: idx = bjtmp[j];
2075: rtmp11[idx] = 0.0;
2076: }
2078: /* load in initial (unfactored row) */
2079: idx = r[row];
2080: nz_tmp = ai[idx + 1] - ai[idx];
2081: ajtmp = aj + ai[idx];
2082: v1 = aa + ai[idx];
2084: for (j = 0; j < nz_tmp; j++) {
2085: idx = ics[ajtmp[j]];
2086: rtmp11[idx] = v1[j];
2087: }
2088: rtmp11[ics[r[row]]] += sctx.shift_amount;
2090: prow = *bjtmp++;
2091: while (prow < row) {
2092: pc1 = rtmp11 + prow;
2093: if (*pc1 != 0.0) {
2094: pv = ba + bd[prow];
2095: pj = nbj + bd[prow];
2096: mul1 = *pc1 * *pv++;
2097: *pc1 = mul1;
2098: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2099: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2100: for (j = 0; j < nz_tmp; j++) {
2101: tmp = pv[j];
2102: idx = pj[j];
2103: rtmp11[idx] -= mul1 * tmp;
2104: }
2105: }
2106: prow = *bjtmp++;
2107: }
2108: pj = bj + bi[row];
2109: pc1 = ba + bi[row];
2111: sctx.pv = rtmp11[row];
2112: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2113: rs = 0.0;
2114: for (j = 0; j < nz; j++) {
2115: idx = pj[j];
2116: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2117: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2118: }
2119: sctx.rs = rs;
2120: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2121: if (sctx.newshift) goto endofwhile;
2122: break;
2124: case 2:
2125: for (j = 0; j < nz; j++) {
2126: idx = bjtmp[j];
2127: rtmp11[idx] = 0.0;
2128: rtmp22[idx] = 0.0;
2129: }
2131: /* load in initial (unfactored row) */
2132: idx = r[row];
2133: nz_tmp = ai[idx + 1] - ai[idx];
2134: ajtmp = aj + ai[idx];
2135: v1 = aa + ai[idx];
2136: v2 = aa + ai[idx + 1];
2137: for (j = 0; j < nz_tmp; j++) {
2138: idx = ics[ajtmp[j]];
2139: rtmp11[idx] = v1[j];
2140: rtmp22[idx] = v2[j];
2141: }
2142: rtmp11[ics[r[row]]] += sctx.shift_amount;
2143: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2145: prow = *bjtmp++;
2146: while (prow < row) {
2147: pc1 = rtmp11 + prow;
2148: pc2 = rtmp22 + prow;
2149: if (*pc1 != 0.0 || *pc2 != 0.0) {
2150: pv = ba + bd[prow];
2151: pj = nbj + bd[prow];
2152: mul1 = *pc1 * *pv;
2153: mul2 = *pc2 * *pv;
2154: ++pv;
2155: *pc1 = mul1;
2156: *pc2 = mul2;
2158: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2159: for (j = 0; j < nz_tmp; j++) {
2160: tmp = pv[j];
2161: idx = pj[j];
2162: rtmp11[idx] -= mul1 * tmp;
2163: rtmp22[idx] -= mul2 * tmp;
2164: }
2165: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2166: }
2167: prow = *bjtmp++;
2168: }
2170: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2171: pc1 = rtmp11 + prow;
2172: pc2 = rtmp22 + prow;
2174: sctx.pv = *pc1;
2175: pj = bj + bi[prow];
2176: rs = 0.0;
2177: for (j = 0; j < nz; j++) {
2178: idx = pj[j];
2179: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2180: }
2181: sctx.rs = rs;
2182: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2183: if (sctx.newshift) goto endofwhile;
2185: if (*pc2 != 0.0) {
2186: pj = nbj + bd[prow];
2187: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2188: *pc2 = mul2;
2189: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2190: for (j = 0; j < nz_tmp; j++) {
2191: idx = pj[j];
2192: tmp = rtmp11[idx];
2193: rtmp22[idx] -= mul2 * tmp;
2194: }
2195: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2196: }
2198: pj = bj + bi[row];
2199: pc1 = ba + bi[row];
2200: pc2 = ba + bi[row + 1];
2202: sctx.pv = rtmp22[row + 1];
2203: rs = 0.0;
2204: rtmp11[row] = 1.0 / rtmp11[row];
2205: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2206: /* copy row entries from dense representation to sparse */
2207: for (j = 0; j < nz; j++) {
2208: idx = pj[j];
2209: pc1[j] = rtmp11[idx];
2210: pc2[j] = rtmp22[idx];
2211: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2212: }
2213: sctx.rs = rs;
2214: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2215: if (sctx.newshift) goto endofwhile;
2216: break;
2218: case 3:
2219: for (j = 0; j < nz; j++) {
2220: idx = bjtmp[j];
2221: rtmp11[idx] = 0.0;
2222: rtmp22[idx] = 0.0;
2223: rtmp33[idx] = 0.0;
2224: }
2225: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2226: idx = r[row];
2227: nz_tmp = ai[idx + 1] - ai[idx];
2228: ajtmp = aj + ai[idx];
2229: v1 = aa + ai[idx];
2230: v2 = aa + ai[idx + 1];
2231: v3 = aa + ai[idx + 2];
2232: for (j = 0; j < nz_tmp; j++) {
2233: idx = ics[ajtmp[j]];
2234: rtmp11[idx] = v1[j];
2235: rtmp22[idx] = v2[j];
2236: rtmp33[idx] = v3[j];
2237: }
2238: rtmp11[ics[r[row]]] += sctx.shift_amount;
2239: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2240: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2242: /* loop over all pivot row blocks above this row block */
2243: prow = *bjtmp++;
2244: while (prow < row) {
2245: pc1 = rtmp11 + prow;
2246: pc2 = rtmp22 + prow;
2247: pc3 = rtmp33 + prow;
2248: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2249: pv = ba + bd[prow];
2250: pj = nbj + bd[prow];
2251: mul1 = *pc1 * *pv;
2252: mul2 = *pc2 * *pv;
2253: mul3 = *pc3 * *pv;
2254: ++pv;
2255: *pc1 = mul1;
2256: *pc2 = mul2;
2257: *pc3 = mul3;
2259: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2260: /* update this row based on pivot row */
2261: for (j = 0; j < nz_tmp; j++) {
2262: tmp = pv[j];
2263: idx = pj[j];
2264: rtmp11[idx] -= mul1 * tmp;
2265: rtmp22[idx] -= mul2 * tmp;
2266: rtmp33[idx] -= mul3 * tmp;
2267: }
2268: PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2269: }
2270: prow = *bjtmp++;
2271: }
2273: /* Now take care of diagonal 3x3 block in this set of rows */
2274: /* note: prow = row here */
2275: pc1 = rtmp11 + prow;
2276: pc2 = rtmp22 + prow;
2277: pc3 = rtmp33 + prow;
2279: sctx.pv = *pc1;
2280: pj = bj + bi[prow];
2281: rs = 0.0;
2282: for (j = 0; j < nz; j++) {
2283: idx = pj[j];
2284: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2285: }
2286: sctx.rs = rs;
2287: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2288: if (sctx.newshift) goto endofwhile;
2290: if (*pc2 != 0.0 || *pc3 != 0.0) {
2291: mul2 = (*pc2) / (*pc1);
2292: mul3 = (*pc3) / (*pc1);
2293: *pc2 = mul2;
2294: *pc3 = mul3;
2295: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2296: pj = nbj + bd[prow];
2297: for (j = 0; j < nz_tmp; j++) {
2298: idx = pj[j];
2299: tmp = rtmp11[idx];
2300: rtmp22[idx] -= mul2 * tmp;
2301: rtmp33[idx] -= mul3 * tmp;
2302: }
2303: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2304: }
2305: ++prow;
2307: pc2 = rtmp22 + prow;
2308: pc3 = rtmp33 + prow;
2309: sctx.pv = *pc2;
2310: pj = bj + bi[prow];
2311: rs = 0.0;
2312: for (j = 0; j < nz; j++) {
2313: idx = pj[j];
2314: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2315: }
2316: sctx.rs = rs;
2317: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2318: if (sctx.newshift) goto endofwhile;
2320: if (*pc3 != 0.0) {
2321: mul3 = (*pc3) / (*pc2);
2322: *pc3 = mul3;
2323: pj = nbj + bd[prow];
2324: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2325: for (j = 0; j < nz_tmp; j++) {
2326: idx = pj[j];
2327: tmp = rtmp22[idx];
2328: rtmp33[idx] -= mul3 * tmp;
2329: }
2330: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2331: }
2333: pj = bj + bi[row];
2334: pc1 = ba + bi[row];
2335: pc2 = ba + bi[row + 1];
2336: pc3 = ba + bi[row + 2];
2338: sctx.pv = rtmp33[row + 2];
2339: rs = 0.0;
2340: rtmp11[row] = 1.0 / rtmp11[row];
2341: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2342: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2343: /* copy row entries from dense representation to sparse */
2344: for (j = 0; j < nz; j++) {
2345: idx = pj[j];
2346: pc1[j] = rtmp11[idx];
2347: pc2[j] = rtmp22[idx];
2348: pc3[j] = rtmp33[idx];
2349: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2350: }
2352: sctx.rs = rs;
2353: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2354: if (sctx.newshift) goto endofwhile;
2355: break;
2357: default:
2358: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2359: }
2360: row += nodesz; /* Update the row */
2361: }
2362: endofwhile:;
2363: } while (sctx.newshift);
2364: PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2365: PetscCall(PetscFree(tmp_vec2));
2366: PetscCall(ISRestoreIndices(isicol, &ic));
2367: PetscCall(ISRestoreIndices(isrow, &r));
2368: PetscCall(ISRestoreIndices(iscol, &c));
2370: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2371: /* do not set solve add, since MatSolve_Inode + Add is faster */
2372: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2373: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2374: C->assembled = PETSC_TRUE;
2375: C->preallocated = PETSC_TRUE;
2376: if (sctx.nshift) {
2377: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2378: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2379: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2380: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2381: }
2382: }
2383: PetscCall(PetscLogFlops(C->cmap->n));
2384: PetscCall(MatSeqAIJCheckInode(C));
2385: PetscFunctionReturn(PETSC_SUCCESS);
2386: }
2387: #endif
2389: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2390: {
2391: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2392: IS iscol = a->col, isrow = a->row;
2393: const PetscInt *r, *c, *rout, *cout;
2394: PetscInt i, j, n = A->rmap->n;
2395: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2396: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2397: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2398: PetscScalar sum1, sum2, sum3, sum4, sum5;
2399: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2400: const PetscScalar *b;
2402: PetscFunctionBegin;
2403: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2404: node_max = a->inode.node_count;
2405: ns = a->inode.size; /* Node Size array */
2407: PetscCall(VecGetArrayRead(bb, &b));
2408: PetscCall(VecGetArrayWrite(xx, &x));
2409: tmp = a->solve_work;
2411: PetscCall(ISGetIndices(isrow, &rout));
2412: r = rout;
2413: PetscCall(ISGetIndices(iscol, &cout));
2414: c = cout;
2416: /* forward solve the lower triangular */
2417: tmps = tmp;
2418: aa = a_a;
2419: aj = a_j;
2420: ad = a->diag;
2422: for (i = 0, row = 0; i < node_max; ++i) {
2423: nsz = ns[i];
2424: aii = ai[row];
2425: v1 = aa + aii;
2426: vi = aj + aii;
2427: nz = ai[row + 1] - ai[row];
2429: if (i < node_max - 1) {
2430: /* Prefetch the indices for the next block */
2431: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2432: /* Prefetch the data for the next block */
2433: PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2434: }
2436: switch (nsz) { /* Each loop in 'case' is unrolled */
2437: case 1:
2438: sum1 = b[r[row]];
2439: for (j = 0; j < nz - 1; j += 2) {
2440: i0 = vi[j];
2441: i1 = vi[j + 1];
2442: tmp0 = tmps[i0];
2443: tmp1 = tmps[i1];
2444: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2445: }
2446: if (j == nz - 1) {
2447: tmp0 = tmps[vi[j]];
2448: sum1 -= v1[j] * tmp0;
2449: }
2450: tmp[row++] = sum1;
2451: break;
2452: case 2:
2453: sum1 = b[r[row]];
2454: sum2 = b[r[row + 1]];
2455: v2 = aa + ai[row + 1];
2457: for (j = 0; j < nz - 1; j += 2) {
2458: i0 = vi[j];
2459: i1 = vi[j + 1];
2460: tmp0 = tmps[i0];
2461: tmp1 = tmps[i1];
2462: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2463: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2464: }
2465: if (j == nz - 1) {
2466: tmp0 = tmps[vi[j]];
2467: sum1 -= v1[j] * tmp0;
2468: sum2 -= v2[j] * tmp0;
2469: }
2470: sum2 -= v2[nz] * sum1;
2471: tmp[row++] = sum1;
2472: tmp[row++] = sum2;
2473: break;
2474: case 3:
2475: sum1 = b[r[row]];
2476: sum2 = b[r[row + 1]];
2477: sum3 = b[r[row + 2]];
2478: v2 = aa + ai[row + 1];
2479: v3 = aa + ai[row + 2];
2481: for (j = 0; j < nz - 1; j += 2) {
2482: i0 = vi[j];
2483: i1 = vi[j + 1];
2484: tmp0 = tmps[i0];
2485: tmp1 = tmps[i1];
2486: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2487: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2488: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2489: }
2490: if (j == nz - 1) {
2491: tmp0 = tmps[vi[j]];
2492: sum1 -= v1[j] * tmp0;
2493: sum2 -= v2[j] * tmp0;
2494: sum3 -= v3[j] * tmp0;
2495: }
2496: sum2 -= v2[nz] * sum1;
2497: sum3 -= v3[nz] * sum1;
2498: sum3 -= v3[nz + 1] * sum2;
2499: tmp[row++] = sum1;
2500: tmp[row++] = sum2;
2501: tmp[row++] = sum3;
2502: break;
2504: case 4:
2505: sum1 = b[r[row]];
2506: sum2 = b[r[row + 1]];
2507: sum3 = b[r[row + 2]];
2508: sum4 = b[r[row + 3]];
2509: v2 = aa + ai[row + 1];
2510: v3 = aa + ai[row + 2];
2511: v4 = aa + ai[row + 3];
2513: for (j = 0; j < nz - 1; j += 2) {
2514: i0 = vi[j];
2515: i1 = vi[j + 1];
2516: tmp0 = tmps[i0];
2517: tmp1 = tmps[i1];
2518: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2519: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2520: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2521: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2522: }
2523: if (j == nz - 1) {
2524: tmp0 = tmps[vi[j]];
2525: sum1 -= v1[j] * tmp0;
2526: sum2 -= v2[j] * tmp0;
2527: sum3 -= v3[j] * tmp0;
2528: sum4 -= v4[j] * tmp0;
2529: }
2530: sum2 -= v2[nz] * sum1;
2531: sum3 -= v3[nz] * sum1;
2532: sum4 -= v4[nz] * sum1;
2533: sum3 -= v3[nz + 1] * sum2;
2534: sum4 -= v4[nz + 1] * sum2;
2535: sum4 -= v4[nz + 2] * sum3;
2537: tmp[row++] = sum1;
2538: tmp[row++] = sum2;
2539: tmp[row++] = sum3;
2540: tmp[row++] = sum4;
2541: break;
2542: case 5:
2543: sum1 = b[r[row]];
2544: sum2 = b[r[row + 1]];
2545: sum3 = b[r[row + 2]];
2546: sum4 = b[r[row + 3]];
2547: sum5 = b[r[row + 4]];
2548: v2 = aa + ai[row + 1];
2549: v3 = aa + ai[row + 2];
2550: v4 = aa + ai[row + 3];
2551: v5 = aa + ai[row + 4];
2553: for (j = 0; j < nz - 1; j += 2) {
2554: i0 = vi[j];
2555: i1 = vi[j + 1];
2556: tmp0 = tmps[i0];
2557: tmp1 = tmps[i1];
2558: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2559: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2560: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2561: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2562: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2563: }
2564: if (j == nz - 1) {
2565: tmp0 = tmps[vi[j]];
2566: sum1 -= v1[j] * tmp0;
2567: sum2 -= v2[j] * tmp0;
2568: sum3 -= v3[j] * tmp0;
2569: sum4 -= v4[j] * tmp0;
2570: sum5 -= v5[j] * tmp0;
2571: }
2573: sum2 -= v2[nz] * sum1;
2574: sum3 -= v3[nz] * sum1;
2575: sum4 -= v4[nz] * sum1;
2576: sum5 -= v5[nz] * sum1;
2577: sum3 -= v3[nz + 1] * sum2;
2578: sum4 -= v4[nz + 1] * sum2;
2579: sum5 -= v5[nz + 1] * sum2;
2580: sum4 -= v4[nz + 2] * sum3;
2581: sum5 -= v5[nz + 2] * sum3;
2582: sum5 -= v5[nz + 3] * sum4;
2584: tmp[row++] = sum1;
2585: tmp[row++] = sum2;
2586: tmp[row++] = sum3;
2587: tmp[row++] = sum4;
2588: tmp[row++] = sum5;
2589: break;
2590: default:
2591: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2592: }
2593: }
2594: /* backward solve the upper triangular */
2595: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2596: nsz = ns[i];
2597: aii = ad[row + 1] + 1;
2598: v1 = aa + aii;
2599: vi = aj + aii;
2600: nz = ad[row] - ad[row + 1] - 1;
2602: if (i > 0) {
2603: /* Prefetch the indices for the next block */
2604: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2605: /* Prefetch the data for the next block */
2606: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2607: }
2609: switch (nsz) { /* Each loop in 'case' is unrolled */
2610: case 1:
2611: sum1 = tmp[row];
2613: for (j = 0; j < nz - 1; j += 2) {
2614: i0 = vi[j];
2615: i1 = vi[j + 1];
2616: tmp0 = tmps[i0];
2617: tmp1 = tmps[i1];
2618: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2619: }
2620: if (j == nz - 1) {
2621: tmp0 = tmps[vi[j]];
2622: sum1 -= v1[j] * tmp0;
2623: }
2624: x[c[row]] = tmp[row] = sum1 * v1[nz];
2625: row--;
2626: break;
2627: case 2:
2628: sum1 = tmp[row];
2629: sum2 = tmp[row - 1];
2630: v2 = aa + ad[row] + 1;
2631: for (j = 0; j < nz - 1; j += 2) {
2632: i0 = vi[j];
2633: i1 = vi[j + 1];
2634: tmp0 = tmps[i0];
2635: tmp1 = tmps[i1];
2636: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2637: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2638: }
2639: if (j == nz - 1) {
2640: tmp0 = tmps[vi[j]];
2641: sum1 -= v1[j] * tmp0;
2642: sum2 -= v2[j + 1] * tmp0;
2643: }
2645: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2646: row--;
2647: sum2 -= v2[0] * tmp0;
2648: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2649: row--;
2650: break;
2651: case 3:
2652: sum1 = tmp[row];
2653: sum2 = tmp[row - 1];
2654: sum3 = tmp[row - 2];
2655: v2 = aa + ad[row] + 1;
2656: v3 = aa + ad[row - 1] + 1;
2657: for (j = 0; j < nz - 1; j += 2) {
2658: i0 = vi[j];
2659: i1 = vi[j + 1];
2660: tmp0 = tmps[i0];
2661: tmp1 = tmps[i1];
2662: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2663: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2664: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2665: }
2666: if (j == nz - 1) {
2667: tmp0 = tmps[vi[j]];
2668: sum1 -= v1[j] * tmp0;
2669: sum2 -= v2[j + 1] * tmp0;
2670: sum3 -= v3[j + 2] * tmp0;
2671: }
2672: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2673: row--;
2674: sum2 -= v2[0] * tmp0;
2675: sum3 -= v3[1] * tmp0;
2676: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2677: row--;
2678: sum3 -= v3[0] * tmp0;
2679: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2680: row--;
2682: break;
2683: case 4:
2684: sum1 = tmp[row];
2685: sum2 = tmp[row - 1];
2686: sum3 = tmp[row - 2];
2687: sum4 = tmp[row - 3];
2688: v2 = aa + ad[row] + 1;
2689: v3 = aa + ad[row - 1] + 1;
2690: v4 = aa + ad[row - 2] + 1;
2692: for (j = 0; j < nz - 1; j += 2) {
2693: i0 = vi[j];
2694: i1 = vi[j + 1];
2695: tmp0 = tmps[i0];
2696: tmp1 = tmps[i1];
2697: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2698: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2699: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2700: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2701: }
2702: if (j == nz - 1) {
2703: tmp0 = tmps[vi[j]];
2704: sum1 -= v1[j] * tmp0;
2705: sum2 -= v2[j + 1] * tmp0;
2706: sum3 -= v3[j + 2] * tmp0;
2707: sum4 -= v4[j + 3] * tmp0;
2708: }
2710: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2711: row--;
2712: sum2 -= v2[0] * tmp0;
2713: sum3 -= v3[1] * tmp0;
2714: sum4 -= v4[2] * tmp0;
2715: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2716: row--;
2717: sum3 -= v3[0] * tmp0;
2718: sum4 -= v4[1] * tmp0;
2719: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2720: row--;
2721: sum4 -= v4[0] * tmp0;
2722: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2723: row--;
2724: break;
2725: case 5:
2726: sum1 = tmp[row];
2727: sum2 = tmp[row - 1];
2728: sum3 = tmp[row - 2];
2729: sum4 = tmp[row - 3];
2730: sum5 = tmp[row - 4];
2731: v2 = aa + ad[row] + 1;
2732: v3 = aa + ad[row - 1] + 1;
2733: v4 = aa + ad[row - 2] + 1;
2734: v5 = aa + ad[row - 3] + 1;
2735: for (j = 0; j < nz - 1; j += 2) {
2736: i0 = vi[j];
2737: i1 = vi[j + 1];
2738: tmp0 = tmps[i0];
2739: tmp1 = tmps[i1];
2740: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2741: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2742: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2743: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2744: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2745: }
2746: if (j == nz - 1) {
2747: tmp0 = tmps[vi[j]];
2748: sum1 -= v1[j] * tmp0;
2749: sum2 -= v2[j + 1] * tmp0;
2750: sum3 -= v3[j + 2] * tmp0;
2751: sum4 -= v4[j + 3] * tmp0;
2752: sum5 -= v5[j + 4] * tmp0;
2753: }
2755: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2756: row--;
2757: sum2 -= v2[0] * tmp0;
2758: sum3 -= v3[1] * tmp0;
2759: sum4 -= v4[2] * tmp0;
2760: sum5 -= v5[3] * tmp0;
2761: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2762: row--;
2763: sum3 -= v3[0] * tmp0;
2764: sum4 -= v4[1] * tmp0;
2765: sum5 -= v5[2] * tmp0;
2766: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2767: row--;
2768: sum4 -= v4[0] * tmp0;
2769: sum5 -= v5[1] * tmp0;
2770: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2771: row--;
2772: sum5 -= v5[0] * tmp0;
2773: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2774: row--;
2775: break;
2776: default:
2777: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2778: }
2779: }
2780: PetscCall(ISRestoreIndices(isrow, &rout));
2781: PetscCall(ISRestoreIndices(iscol, &cout));
2782: PetscCall(VecRestoreArrayRead(bb, &b));
2783: PetscCall(VecRestoreArrayWrite(xx, &x));
2784: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2785: PetscFunctionReturn(PETSC_SUCCESS);
2786: }
2788: /*
2789: Makes a longer coloring[] array and calls the usual code with that
2790: */
2791: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2792: {
2793: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2794: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2795: PetscInt *colorused, i;
2796: ISColoringValue *newcolor;
2798: PetscFunctionBegin;
2799: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2800: PetscCall(PetscMalloc1(n + 1, &newcolor));
2801: /* loop over inodes, marking a color for each column*/
2802: row = 0;
2803: for (i = 0; i < m; i++) {
2804: for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2805: }
2807: /* eliminate unneeded colors */
2808: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2809: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2811: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2812: ncolors = colorused[5 * ncolors - 1];
2813: for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2814: PetscCall(PetscFree(colorused));
2815: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2816: PetscCall(PetscFree(coloring));
2817: PetscFunctionReturn(PETSC_SUCCESS);
2818: }
2820: #include <petsc/private/kernels/blockinvert.h>
2822: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2823: {
2824: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2825: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2826: MatScalar *ibdiag, *bdiag, work[25], *t;
2827: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2828: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2829: const PetscScalar *xb, *b;
2830: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2831: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2832: PetscInt sz, k, ipvt[5];
2833: PetscBool allowzeropivot, zeropivotdetected;
2834: const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2836: PetscFunctionBegin;
2837: allowzeropivot = PetscNot(A->erroriffailure);
2838: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2839: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2840: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2842: if (!a->inode.ibdiagvalid) {
2843: if (!a->inode.ibdiag) {
2844: /* calculate space needed for diagonal blocks */
2845: for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2846: a->inode.bdiagsize = cnt;
2848: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2849: }
2851: /* copy over the diagonal blocks and invert them */
2852: ibdiag = a->inode.ibdiag;
2853: bdiag = a->inode.bdiag;
2854: cnt = 0;
2855: for (i = 0, row = 0; i < m; i++) {
2856: for (j = 0; j < sizes[i]; j++) {
2857: for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2858: }
2859: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2861: switch (sizes[i]) {
2862: case 1:
2863: /* Create matrix data structure */
2864: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2865: if (allowzeropivot) {
2866: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2868: A->factorerror_zeropivot_row = row;
2869: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2870: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2871: }
2872: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2873: break;
2874: case 2:
2875: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2876: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2877: break;
2878: case 3:
2879: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2880: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2881: break;
2882: case 4:
2883: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2884: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2885: break;
2886: case 5:
2887: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2888: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2889: break;
2890: default:
2891: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2892: }
2893: cnt += sizes[i] * sizes[i];
2894: row += sizes[i];
2895: }
2896: a->inode.ibdiagvalid = PETSC_TRUE;
2897: }
2898: ibdiag = a->inode.ibdiag;
2899: bdiag = a->inode.bdiag;
2900: t = a->inode.ssor_work;
2902: PetscCall(VecGetArray(xx, &x));
2903: PetscCall(VecGetArrayRead(bb, &b));
2904: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2905: if (flag & SOR_ZERO_INITIAL_GUESS) {
2906: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2907: for (i = 0, row = 0; i < m; i++) {
2908: sz = diag[row] - ii[row];
2909: v1 = a->a + ii[row];
2910: idx = a->j + ii[row];
2912: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2913: switch (sizes[i]) {
2914: case 1:
2916: sum1 = b[row];
2917: for (n = 0; n < sz - 1; n += 2) {
2918: i1 = idx[0];
2919: i2 = idx[1];
2920: idx += 2;
2921: tmp0 = x[i1];
2922: tmp1 = x[i2];
2923: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2924: v1 += 2;
2925: }
2927: if (n == sz - 1) {
2928: tmp0 = x[*idx];
2929: sum1 -= *v1 * tmp0;
2930: }
2931: t[row] = sum1;
2932: x[row++] = sum1 * (*ibdiag++);
2933: break;
2934: case 2:
2935: v2 = a->a + ii[row + 1];
2936: sum1 = b[row];
2937: sum2 = b[row + 1];
2938: for (n = 0; n < sz - 1; n += 2) {
2939: i1 = idx[0];
2940: i2 = idx[1];
2941: idx += 2;
2942: tmp0 = x[i1];
2943: tmp1 = x[i2];
2944: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2945: v1 += 2;
2946: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2947: v2 += 2;
2948: }
2950: if (n == sz - 1) {
2951: tmp0 = x[*idx];
2952: sum1 -= v1[0] * tmp0;
2953: sum2 -= v2[0] * tmp0;
2954: }
2955: t[row] = sum1;
2956: t[row + 1] = sum2;
2957: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2958: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2959: ibdiag += 4;
2960: break;
2961: case 3:
2962: v2 = a->a + ii[row + 1];
2963: v3 = a->a + ii[row + 2];
2964: sum1 = b[row];
2965: sum2 = b[row + 1];
2966: sum3 = b[row + 2];
2967: for (n = 0; n < sz - 1; n += 2) {
2968: i1 = idx[0];
2969: i2 = idx[1];
2970: idx += 2;
2971: tmp0 = x[i1];
2972: tmp1 = x[i2];
2973: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2974: v1 += 2;
2975: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2976: v2 += 2;
2977: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2978: v3 += 2;
2979: }
2981: if (n == sz - 1) {
2982: tmp0 = x[*idx];
2983: sum1 -= v1[0] * tmp0;
2984: sum2 -= v2[0] * tmp0;
2985: sum3 -= v3[0] * tmp0;
2986: }
2987: t[row] = sum1;
2988: t[row + 1] = sum2;
2989: t[row + 2] = sum3;
2990: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2991: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2992: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2993: ibdiag += 9;
2994: break;
2995: case 4:
2996: v2 = a->a + ii[row + 1];
2997: v3 = a->a + ii[row + 2];
2998: v4 = a->a + ii[row + 3];
2999: sum1 = b[row];
3000: sum2 = b[row + 1];
3001: sum3 = b[row + 2];
3002: sum4 = b[row + 3];
3003: for (n = 0; n < sz - 1; n += 2) {
3004: i1 = idx[0];
3005: i2 = idx[1];
3006: idx += 2;
3007: tmp0 = x[i1];
3008: tmp1 = x[i2];
3009: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3010: v1 += 2;
3011: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3012: v2 += 2;
3013: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3014: v3 += 2;
3015: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3016: v4 += 2;
3017: }
3019: if (n == sz - 1) {
3020: tmp0 = x[*idx];
3021: sum1 -= v1[0] * tmp0;
3022: sum2 -= v2[0] * tmp0;
3023: sum3 -= v3[0] * tmp0;
3024: sum4 -= v4[0] * tmp0;
3025: }
3026: t[row] = sum1;
3027: t[row + 1] = sum2;
3028: t[row + 2] = sum3;
3029: t[row + 3] = sum4;
3030: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3031: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3032: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3033: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3034: ibdiag += 16;
3035: break;
3036: case 5:
3037: v2 = a->a + ii[row + 1];
3038: v3 = a->a + ii[row + 2];
3039: v4 = a->a + ii[row + 3];
3040: v5 = a->a + ii[row + 4];
3041: sum1 = b[row];
3042: sum2 = b[row + 1];
3043: sum3 = b[row + 2];
3044: sum4 = b[row + 3];
3045: sum5 = b[row + 4];
3046: for (n = 0; n < sz - 1; n += 2) {
3047: i1 = idx[0];
3048: i2 = idx[1];
3049: idx += 2;
3050: tmp0 = x[i1];
3051: tmp1 = x[i2];
3052: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3053: v1 += 2;
3054: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3055: v2 += 2;
3056: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3057: v3 += 2;
3058: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3059: v4 += 2;
3060: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3061: v5 += 2;
3062: }
3064: if (n == sz - 1) {
3065: tmp0 = x[*idx];
3066: sum1 -= v1[0] * tmp0;
3067: sum2 -= v2[0] * tmp0;
3068: sum3 -= v3[0] * tmp0;
3069: sum4 -= v4[0] * tmp0;
3070: sum5 -= v5[0] * tmp0;
3071: }
3072: t[row] = sum1;
3073: t[row + 1] = sum2;
3074: t[row + 2] = sum3;
3075: t[row + 3] = sum4;
3076: t[row + 4] = sum5;
3077: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3078: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3079: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3080: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3081: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3082: ibdiag += 25;
3083: break;
3084: default:
3085: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3086: }
3087: }
3089: xb = t;
3090: PetscCall(PetscLogFlops(a->nz));
3091: } else xb = b;
3092: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3093: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3094: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3095: ibdiag -= sizes[i] * sizes[i];
3096: sz = ii[row + 1] - diag[row] - 1;
3097: v1 = a->a + diag[row] + 1;
3098: idx = a->j + diag[row] + 1;
3100: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3101: switch (sizes[i]) {
3102: case 1:
3104: sum1 = xb[row];
3105: for (n = 0; n < sz - 1; n += 2) {
3106: i1 = idx[0];
3107: i2 = idx[1];
3108: idx += 2;
3109: tmp0 = x[i1];
3110: tmp1 = x[i2];
3111: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3112: v1 += 2;
3113: }
3115: if (n == sz - 1) {
3116: tmp0 = x[*idx];
3117: sum1 -= *v1 * tmp0;
3118: }
3119: x[row--] = sum1 * (*ibdiag);
3120: break;
3122: case 2:
3124: sum1 = xb[row];
3125: sum2 = xb[row - 1];
3126: /* note that sum1 is associated with the second of the two rows */
3127: v2 = a->a + diag[row - 1] + 2;
3128: for (n = 0; n < sz - 1; n += 2) {
3129: i1 = idx[0];
3130: i2 = idx[1];
3131: idx += 2;
3132: tmp0 = x[i1];
3133: tmp1 = x[i2];
3134: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135: v1 += 2;
3136: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3137: v2 += 2;
3138: }
3140: if (n == sz - 1) {
3141: tmp0 = x[*idx];
3142: sum1 -= *v1 * tmp0;
3143: sum2 -= *v2 * tmp0;
3144: }
3145: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3146: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3147: break;
3148: case 3:
3150: sum1 = xb[row];
3151: sum2 = xb[row - 1];
3152: sum3 = xb[row - 2];
3153: v2 = a->a + diag[row - 1] + 2;
3154: v3 = a->a + diag[row - 2] + 3;
3155: for (n = 0; n < sz - 1; n += 2) {
3156: i1 = idx[0];
3157: i2 = idx[1];
3158: idx += 2;
3159: tmp0 = x[i1];
3160: tmp1 = x[i2];
3161: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3162: v1 += 2;
3163: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3164: v2 += 2;
3165: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3166: v3 += 2;
3167: }
3169: if (n == sz - 1) {
3170: tmp0 = x[*idx];
3171: sum1 -= *v1 * tmp0;
3172: sum2 -= *v2 * tmp0;
3173: sum3 -= *v3 * tmp0;
3174: }
3175: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3176: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3177: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3178: break;
3179: case 4:
3181: sum1 = xb[row];
3182: sum2 = xb[row - 1];
3183: sum3 = xb[row - 2];
3184: sum4 = xb[row - 3];
3185: v2 = a->a + diag[row - 1] + 2;
3186: v3 = a->a + diag[row - 2] + 3;
3187: v4 = a->a + diag[row - 3] + 4;
3188: for (n = 0; n < sz - 1; n += 2) {
3189: i1 = idx[0];
3190: i2 = idx[1];
3191: idx += 2;
3192: tmp0 = x[i1];
3193: tmp1 = x[i2];
3194: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3195: v1 += 2;
3196: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3197: v2 += 2;
3198: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3199: v3 += 2;
3200: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3201: v4 += 2;
3202: }
3204: if (n == sz - 1) {
3205: tmp0 = x[*idx];
3206: sum1 -= *v1 * tmp0;
3207: sum2 -= *v2 * tmp0;
3208: sum3 -= *v3 * tmp0;
3209: sum4 -= *v4 * tmp0;
3210: }
3211: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3212: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3213: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3214: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3215: break;
3216: case 5:
3218: sum1 = xb[row];
3219: sum2 = xb[row - 1];
3220: sum3 = xb[row - 2];
3221: sum4 = xb[row - 3];
3222: sum5 = xb[row - 4];
3223: v2 = a->a + diag[row - 1] + 2;
3224: v3 = a->a + diag[row - 2] + 3;
3225: v4 = a->a + diag[row - 3] + 4;
3226: v5 = a->a + diag[row - 4] + 5;
3227: for (n = 0; n < sz - 1; n += 2) {
3228: i1 = idx[0];
3229: i2 = idx[1];
3230: idx += 2;
3231: tmp0 = x[i1];
3232: tmp1 = x[i2];
3233: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3234: v1 += 2;
3235: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3236: v2 += 2;
3237: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3238: v3 += 2;
3239: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3240: v4 += 2;
3241: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3242: v5 += 2;
3243: }
3245: if (n == sz - 1) {
3246: tmp0 = x[*idx];
3247: sum1 -= *v1 * tmp0;
3248: sum2 -= *v2 * tmp0;
3249: sum3 -= *v3 * tmp0;
3250: sum4 -= *v4 * tmp0;
3251: sum5 -= *v5 * tmp0;
3252: }
3253: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3254: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3255: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3256: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3257: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3258: break;
3259: default:
3260: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3261: }
3262: }
3264: PetscCall(PetscLogFlops(a->nz));
3265: }
3266: its--;
3267: }
3268: while (its--) {
3269: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3270: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3271: sz = diag[row] - ii[row];
3272: v1 = a->a + ii[row];
3273: idx = a->j + ii[row];
3274: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3275: switch (sizes[i]) {
3276: case 1:
3277: sum1 = b[row];
3278: for (n = 0; n < sz - 1; n += 2) {
3279: i1 = idx[0];
3280: i2 = idx[1];
3281: idx += 2;
3282: tmp0 = x[i1];
3283: tmp1 = x[i2];
3284: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3285: v1 += 2;
3286: }
3287: if (n == sz - 1) {
3288: tmp0 = x[*idx++];
3289: sum1 -= *v1 * tmp0;
3290: v1++;
3291: }
3292: t[row] = sum1;
3293: sz = ii[row + 1] - diag[row] - 1;
3294: idx = a->j + diag[row] + 1;
3295: v1 += 1;
3296: for (n = 0; n < sz - 1; n += 2) {
3297: i1 = idx[0];
3298: i2 = idx[1];
3299: idx += 2;
3300: tmp0 = x[i1];
3301: tmp1 = x[i2];
3302: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3303: v1 += 2;
3304: }
3305: if (n == sz - 1) {
3306: tmp0 = x[*idx++];
3307: sum1 -= *v1 * tmp0;
3308: }
3309: /* in MatSOR_SeqAIJ this line would be
3310: *
3311: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3312: *
3313: * but omega == 1, so this becomes
3314: *
3315: * x[row] = sum1*(*ibdiag++);
3316: *
3317: */
3318: x[row] = sum1 * (*ibdiag);
3319: break;
3320: case 2:
3321: v2 = a->a + ii[row + 1];
3322: sum1 = b[row];
3323: sum2 = b[row + 1];
3324: for (n = 0; n < sz - 1; n += 2) {
3325: i1 = idx[0];
3326: i2 = idx[1];
3327: idx += 2;
3328: tmp0 = x[i1];
3329: tmp1 = x[i2];
3330: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3331: v1 += 2;
3332: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3333: v2 += 2;
3334: }
3335: if (n == sz - 1) {
3336: tmp0 = x[*idx++];
3337: sum1 -= v1[0] * tmp0;
3338: sum2 -= v2[0] * tmp0;
3339: v1++;
3340: v2++;
3341: }
3342: t[row] = sum1;
3343: t[row + 1] = sum2;
3344: sz = ii[row + 1] - diag[row] - 2;
3345: idx = a->j + diag[row] + 2;
3346: v1 += 2;
3347: v2 += 2;
3348: for (n = 0; n < sz - 1; n += 2) {
3349: i1 = idx[0];
3350: i2 = idx[1];
3351: idx += 2;
3352: tmp0 = x[i1];
3353: tmp1 = x[i2];
3354: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355: v1 += 2;
3356: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357: v2 += 2;
3358: }
3359: if (n == sz - 1) {
3360: tmp0 = x[*idx];
3361: sum1 -= v1[0] * tmp0;
3362: sum2 -= v2[0] * tmp0;
3363: }
3364: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3365: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3366: break;
3367: case 3:
3368: v2 = a->a + ii[row + 1];
3369: v3 = a->a + ii[row + 2];
3370: sum1 = b[row];
3371: sum2 = b[row + 1];
3372: sum3 = b[row + 2];
3373: for (n = 0; n < sz - 1; n += 2) {
3374: i1 = idx[0];
3375: i2 = idx[1];
3376: idx += 2;
3377: tmp0 = x[i1];
3378: tmp1 = x[i2];
3379: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3380: v1 += 2;
3381: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3382: v2 += 2;
3383: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3384: v3 += 2;
3385: }
3386: if (n == sz - 1) {
3387: tmp0 = x[*idx++];
3388: sum1 -= v1[0] * tmp0;
3389: sum2 -= v2[0] * tmp0;
3390: sum3 -= v3[0] * tmp0;
3391: v1++;
3392: v2++;
3393: v3++;
3394: }
3395: t[row] = sum1;
3396: t[row + 1] = sum2;
3397: t[row + 2] = sum3;
3398: sz = ii[row + 1] - diag[row] - 3;
3399: idx = a->j + diag[row] + 3;
3400: v1 += 3;
3401: v2 += 3;
3402: v3 += 3;
3403: for (n = 0; n < sz - 1; n += 2) {
3404: i1 = idx[0];
3405: i2 = idx[1];
3406: idx += 2;
3407: tmp0 = x[i1];
3408: tmp1 = x[i2];
3409: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3410: v1 += 2;
3411: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3412: v2 += 2;
3413: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3414: v3 += 2;
3415: }
3416: if (n == sz - 1) {
3417: tmp0 = x[*idx];
3418: sum1 -= v1[0] * tmp0;
3419: sum2 -= v2[0] * tmp0;
3420: sum3 -= v3[0] * tmp0;
3421: }
3422: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3423: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3424: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3425: break;
3426: case 4:
3427: v2 = a->a + ii[row + 1];
3428: v3 = a->a + ii[row + 2];
3429: v4 = a->a + ii[row + 3];
3430: sum1 = b[row];
3431: sum2 = b[row + 1];
3432: sum3 = b[row + 2];
3433: sum4 = b[row + 3];
3434: for (n = 0; n < sz - 1; n += 2) {
3435: i1 = idx[0];
3436: i2 = idx[1];
3437: idx += 2;
3438: tmp0 = x[i1];
3439: tmp1 = x[i2];
3440: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3441: v1 += 2;
3442: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3443: v2 += 2;
3444: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3445: v3 += 2;
3446: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3447: v4 += 2;
3448: }
3449: if (n == sz - 1) {
3450: tmp0 = x[*idx++];
3451: sum1 -= v1[0] * tmp0;
3452: sum2 -= v2[0] * tmp0;
3453: sum3 -= v3[0] * tmp0;
3454: sum4 -= v4[0] * tmp0;
3455: v1++;
3456: v2++;
3457: v3++;
3458: v4++;
3459: }
3460: t[row] = sum1;
3461: t[row + 1] = sum2;
3462: t[row + 2] = sum3;
3463: t[row + 3] = sum4;
3464: sz = ii[row + 1] - diag[row] - 4;
3465: idx = a->j + diag[row] + 4;
3466: v1 += 4;
3467: v2 += 4;
3468: v3 += 4;
3469: v4 += 4;
3470: for (n = 0; n < sz - 1; n += 2) {
3471: i1 = idx[0];
3472: i2 = idx[1];
3473: idx += 2;
3474: tmp0 = x[i1];
3475: tmp1 = x[i2];
3476: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3477: v1 += 2;
3478: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3479: v2 += 2;
3480: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3481: v3 += 2;
3482: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3483: v4 += 2;
3484: }
3485: if (n == sz - 1) {
3486: tmp0 = x[*idx];
3487: sum1 -= v1[0] * tmp0;
3488: sum2 -= v2[0] * tmp0;
3489: sum3 -= v3[0] * tmp0;
3490: sum4 -= v4[0] * tmp0;
3491: }
3492: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3493: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3494: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3495: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3496: break;
3497: case 5:
3498: v2 = a->a + ii[row + 1];
3499: v3 = a->a + ii[row + 2];
3500: v4 = a->a + ii[row + 3];
3501: v5 = a->a + ii[row + 4];
3502: sum1 = b[row];
3503: sum2 = b[row + 1];
3504: sum3 = b[row + 2];
3505: sum4 = b[row + 3];
3506: sum5 = b[row + 4];
3507: for (n = 0; n < sz - 1; n += 2) {
3508: i1 = idx[0];
3509: i2 = idx[1];
3510: idx += 2;
3511: tmp0 = x[i1];
3512: tmp1 = x[i2];
3513: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3514: v1 += 2;
3515: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3516: v2 += 2;
3517: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3518: v3 += 2;
3519: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3520: v4 += 2;
3521: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3522: v5 += 2;
3523: }
3524: if (n == sz - 1) {
3525: tmp0 = x[*idx++];
3526: sum1 -= v1[0] * tmp0;
3527: sum2 -= v2[0] * tmp0;
3528: sum3 -= v3[0] * tmp0;
3529: sum4 -= v4[0] * tmp0;
3530: sum5 -= v5[0] * tmp0;
3531: v1++;
3532: v2++;
3533: v3++;
3534: v4++;
3535: v5++;
3536: }
3537: t[row] = sum1;
3538: t[row + 1] = sum2;
3539: t[row + 2] = sum3;
3540: t[row + 3] = sum4;
3541: t[row + 4] = sum5;
3542: sz = ii[row + 1] - diag[row] - 5;
3543: idx = a->j + diag[row] + 5;
3544: v1 += 5;
3545: v2 += 5;
3546: v3 += 5;
3547: v4 += 5;
3548: v5 += 5;
3549: for (n = 0; n < sz - 1; n += 2) {
3550: i1 = idx[0];
3551: i2 = idx[1];
3552: idx += 2;
3553: tmp0 = x[i1];
3554: tmp1 = x[i2];
3555: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3556: v1 += 2;
3557: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3558: v2 += 2;
3559: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3560: v3 += 2;
3561: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3562: v4 += 2;
3563: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3564: v5 += 2;
3565: }
3566: if (n == sz - 1) {
3567: tmp0 = x[*idx];
3568: sum1 -= v1[0] * tmp0;
3569: sum2 -= v2[0] * tmp0;
3570: sum3 -= v3[0] * tmp0;
3571: sum4 -= v4[0] * tmp0;
3572: sum5 -= v5[0] * tmp0;
3573: }
3574: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3575: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3576: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3577: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3578: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3579: break;
3580: default:
3581: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3582: }
3583: }
3584: xb = t;
3585: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3586: } else xb = b;
3588: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3589: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3590: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3591: ibdiag -= sizes[i] * sizes[i];
3593: /* set RHS */
3594: if (xb == b) {
3595: /* whole (old way) */
3596: sz = ii[row + 1] - ii[row];
3597: idx = a->j + ii[row];
3598: switch (sizes[i]) {
3599: case 5:
3600: v5 = a->a + ii[row - 4]; /* fall through */
3601: case 4:
3602: v4 = a->a + ii[row - 3]; /* fall through */
3603: case 3:
3604: v3 = a->a + ii[row - 2]; /* fall through */
3605: case 2:
3606: v2 = a->a + ii[row - 1]; /* fall through */
3607: case 1:
3608: v1 = a->a + ii[row];
3609: break;
3610: default:
3611: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3612: }
3613: } else {
3614: /* upper, no diag */
3615: sz = ii[row + 1] - diag[row] - 1;
3616: idx = a->j + diag[row] + 1;
3617: switch (sizes[i]) {
3618: case 5:
3619: v5 = a->a + diag[row - 4] + 5; /* fall through */
3620: case 4:
3621: v4 = a->a + diag[row - 3] + 4; /* fall through */
3622: case 3:
3623: v3 = a->a + diag[row - 2] + 3; /* fall through */
3624: case 2:
3625: v2 = a->a + diag[row - 1] + 2; /* fall through */
3626: case 1:
3627: v1 = a->a + diag[row] + 1;
3628: }
3629: }
3630: /* set sum */
3631: switch (sizes[i]) {
3632: case 5:
3633: sum5 = xb[row - 4]; /* fall through */
3634: case 4:
3635: sum4 = xb[row - 3]; /* fall through */
3636: case 3:
3637: sum3 = xb[row - 2]; /* fall through */
3638: case 2:
3639: sum2 = xb[row - 1]; /* fall through */
3640: case 1:
3641: /* note that sum1 is associated with the last row */
3642: sum1 = xb[row];
3643: }
3644: /* do sums */
3645: for (n = 0; n < sz - 1; n += 2) {
3646: i1 = idx[0];
3647: i2 = idx[1];
3648: idx += 2;
3649: tmp0 = x[i1];
3650: tmp1 = x[i2];
3651: switch (sizes[i]) {
3652: case 5:
3653: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3654: v5 += 2; /* fall through */
3655: case 4:
3656: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3657: v4 += 2; /* fall through */
3658: case 3:
3659: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3660: v3 += 2; /* fall through */
3661: case 2:
3662: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3663: v2 += 2; /* fall through */
3664: case 1:
3665: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3666: v1 += 2;
3667: }
3668: }
3669: /* ragged edge */
3670: if (n == sz - 1) {
3671: tmp0 = x[*idx];
3672: switch (sizes[i]) {
3673: case 5:
3674: sum5 -= *v5 * tmp0; /* fall through */
3675: case 4:
3676: sum4 -= *v4 * tmp0; /* fall through */
3677: case 3:
3678: sum3 -= *v3 * tmp0; /* fall through */
3679: case 2:
3680: sum2 -= *v2 * tmp0; /* fall through */
3681: case 1:
3682: sum1 -= *v1 * tmp0;
3683: }
3684: }
3685: /* update */
3686: if (xb == b) {
3687: /* whole (old way) w/ diag */
3688: switch (sizes[i]) {
3689: case 5:
3690: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3691: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3692: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3693: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3694: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3695: break;
3696: case 4:
3697: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3698: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3699: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3700: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3701: break;
3702: case 3:
3703: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3704: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3705: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3706: break;
3707: case 2:
3708: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3709: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3710: break;
3711: case 1:
3712: x[row--] += sum1 * (*ibdiag);
3713: break;
3714: }
3715: } else {
3716: /* no diag so set = */
3717: switch (sizes[i]) {
3718: case 5:
3719: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3720: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3721: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3722: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3723: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3724: break;
3725: case 4:
3726: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3727: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3728: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3729: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3730: break;
3731: case 3:
3732: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3733: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3734: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3735: break;
3736: case 2:
3737: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3738: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3739: break;
3740: case 1:
3741: x[row--] = sum1 * (*ibdiag);
3742: break;
3743: }
3744: }
3745: }
3746: if (xb == b) {
3747: PetscCall(PetscLogFlops(2.0 * a->nz));
3748: } else {
3749: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3750: }
3751: }
3752: }
3753: if (flag & SOR_EISENSTAT) {
3754: /*
3755: Apply (U + D)^-1 where D is now the block diagonal
3756: */
3757: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3758: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3759: ibdiag -= sizes[i] * sizes[i];
3760: sz = ii[row + 1] - diag[row] - 1;
3761: v1 = a->a + diag[row] + 1;
3762: idx = a->j + diag[row] + 1;
3763: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3764: switch (sizes[i]) {
3765: case 1:
3767: sum1 = b[row];
3768: for (n = 0; n < sz - 1; n += 2) {
3769: i1 = idx[0];
3770: i2 = idx[1];
3771: idx += 2;
3772: tmp0 = x[i1];
3773: tmp1 = x[i2];
3774: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3775: v1 += 2;
3776: }
3778: if (n == sz - 1) {
3779: tmp0 = x[*idx];
3780: sum1 -= *v1 * tmp0;
3781: }
3782: x[row] = sum1 * (*ibdiag);
3783: row--;
3784: break;
3786: case 2:
3788: sum1 = b[row];
3789: sum2 = b[row - 1];
3790: /* note that sum1 is associated with the second of the two rows */
3791: v2 = a->a + diag[row - 1] + 2;
3792: for (n = 0; n < sz - 1; n += 2) {
3793: i1 = idx[0];
3794: i2 = idx[1];
3795: idx += 2;
3796: tmp0 = x[i1];
3797: tmp1 = x[i2];
3798: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3799: v1 += 2;
3800: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3801: v2 += 2;
3802: }
3804: if (n == sz - 1) {
3805: tmp0 = x[*idx];
3806: sum1 -= *v1 * tmp0;
3807: sum2 -= *v2 * tmp0;
3808: }
3809: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3810: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3811: row -= 2;
3812: break;
3813: case 3:
3815: sum1 = b[row];
3816: sum2 = b[row - 1];
3817: sum3 = b[row - 2];
3818: v2 = a->a + diag[row - 1] + 2;
3819: v3 = a->a + diag[row - 2] + 3;
3820: for (n = 0; n < sz - 1; n += 2) {
3821: i1 = idx[0];
3822: i2 = idx[1];
3823: idx += 2;
3824: tmp0 = x[i1];
3825: tmp1 = x[i2];
3826: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3827: v1 += 2;
3828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3829: v2 += 2;
3830: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3831: v3 += 2;
3832: }
3834: if (n == sz - 1) {
3835: tmp0 = x[*idx];
3836: sum1 -= *v1 * tmp0;
3837: sum2 -= *v2 * tmp0;
3838: sum3 -= *v3 * tmp0;
3839: }
3840: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3841: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3842: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3843: row -= 3;
3844: break;
3845: case 4:
3847: sum1 = b[row];
3848: sum2 = b[row - 1];
3849: sum3 = b[row - 2];
3850: sum4 = b[row - 3];
3851: v2 = a->a + diag[row - 1] + 2;
3852: v3 = a->a + diag[row - 2] + 3;
3853: v4 = a->a + diag[row - 3] + 4;
3854: for (n = 0; n < sz - 1; n += 2) {
3855: i1 = idx[0];
3856: i2 = idx[1];
3857: idx += 2;
3858: tmp0 = x[i1];
3859: tmp1 = x[i2];
3860: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3861: v1 += 2;
3862: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3863: v2 += 2;
3864: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3865: v3 += 2;
3866: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3867: v4 += 2;
3868: }
3870: if (n == sz - 1) {
3871: tmp0 = x[*idx];
3872: sum1 -= *v1 * tmp0;
3873: sum2 -= *v2 * tmp0;
3874: sum3 -= *v3 * tmp0;
3875: sum4 -= *v4 * tmp0;
3876: }
3877: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3878: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3879: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3880: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3881: row -= 4;
3882: break;
3883: case 5:
3885: sum1 = b[row];
3886: sum2 = b[row - 1];
3887: sum3 = b[row - 2];
3888: sum4 = b[row - 3];
3889: sum5 = b[row - 4];
3890: v2 = a->a + diag[row - 1] + 2;
3891: v3 = a->a + diag[row - 2] + 3;
3892: v4 = a->a + diag[row - 3] + 4;
3893: v5 = a->a + diag[row - 4] + 5;
3894: for (n = 0; n < sz - 1; n += 2) {
3895: i1 = idx[0];
3896: i2 = idx[1];
3897: idx += 2;
3898: tmp0 = x[i1];
3899: tmp1 = x[i2];
3900: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3901: v1 += 2;
3902: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3903: v2 += 2;
3904: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3905: v3 += 2;
3906: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3907: v4 += 2;
3908: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3909: v5 += 2;
3910: }
3912: if (n == sz - 1) {
3913: tmp0 = x[*idx];
3914: sum1 -= *v1 * tmp0;
3915: sum2 -= *v2 * tmp0;
3916: sum3 -= *v3 * tmp0;
3917: sum4 -= *v4 * tmp0;
3918: sum5 -= *v5 * tmp0;
3919: }
3920: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3921: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3922: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3923: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3924: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3925: row -= 5;
3926: break;
3927: default:
3928: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3929: }
3930: }
3931: PetscCall(PetscLogFlops(a->nz));
3933: /*
3934: t = b - D x where D is the block diagonal
3935: */
3936: cnt = 0;
3937: for (i = 0, row = 0; i < m; i++) {
3938: switch (sizes[i]) {
3939: case 1:
3940: t[row] = b[row] - bdiag[cnt++] * x[row];
3941: row++;
3942: break;
3943: case 2:
3944: x1 = x[row];
3945: x2 = x[row + 1];
3946: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3947: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3948: t[row] = b[row] - tmp1;
3949: t[row + 1] = b[row + 1] - tmp2;
3950: row += 2;
3951: cnt += 4;
3952: break;
3953: case 3:
3954: x1 = x[row];
3955: x2 = x[row + 1];
3956: x3 = x[row + 2];
3957: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3958: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3959: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3960: t[row] = b[row] - tmp1;
3961: t[row + 1] = b[row + 1] - tmp2;
3962: t[row + 2] = b[row + 2] - tmp3;
3963: row += 3;
3964: cnt += 9;
3965: break;
3966: case 4:
3967: x1 = x[row];
3968: x2 = x[row + 1];
3969: x3 = x[row + 2];
3970: x4 = x[row + 3];
3971: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3972: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3973: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3974: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3975: t[row] = b[row] - tmp1;
3976: t[row + 1] = b[row + 1] - tmp2;
3977: t[row + 2] = b[row + 2] - tmp3;
3978: t[row + 3] = b[row + 3] - tmp4;
3979: row += 4;
3980: cnt += 16;
3981: break;
3982: case 5:
3983: x1 = x[row];
3984: x2 = x[row + 1];
3985: x3 = x[row + 2];
3986: x4 = x[row + 3];
3987: x5 = x[row + 4];
3988: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3989: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3990: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3991: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3992: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3993: t[row] = b[row] - tmp1;
3994: t[row + 1] = b[row + 1] - tmp2;
3995: t[row + 2] = b[row + 2] - tmp3;
3996: t[row + 3] = b[row + 3] - tmp4;
3997: t[row + 4] = b[row + 4] - tmp5;
3998: row += 5;
3999: cnt += 25;
4000: break;
4001: default:
4002: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4003: }
4004: }
4005: PetscCall(PetscLogFlops(m));
4007: /*
4008: Apply (L + D)^-1 where D is the block diagonal
4009: */
4010: for (i = 0, row = 0; i < m; i++) {
4011: sz = diag[row] - ii[row];
4012: v1 = a->a + ii[row];
4013: idx = a->j + ii[row];
4014: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4015: switch (sizes[i]) {
4016: case 1:
4018: sum1 = t[row];
4019: for (n = 0; n < sz - 1; n += 2) {
4020: i1 = idx[0];
4021: i2 = idx[1];
4022: idx += 2;
4023: tmp0 = t[i1];
4024: tmp1 = t[i2];
4025: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4026: v1 += 2;
4027: }
4029: if (n == sz - 1) {
4030: tmp0 = t[*idx];
4031: sum1 -= *v1 * tmp0;
4032: }
4033: x[row] += t[row] = sum1 * (*ibdiag++);
4034: row++;
4035: break;
4036: case 2:
4037: v2 = a->a + ii[row + 1];
4038: sum1 = t[row];
4039: sum2 = t[row + 1];
4040: for (n = 0; n < sz - 1; n += 2) {
4041: i1 = idx[0];
4042: i2 = idx[1];
4043: idx += 2;
4044: tmp0 = t[i1];
4045: tmp1 = t[i2];
4046: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4047: v1 += 2;
4048: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4049: v2 += 2;
4050: }
4052: if (n == sz - 1) {
4053: tmp0 = t[*idx];
4054: sum1 -= v1[0] * tmp0;
4055: sum2 -= v2[0] * tmp0;
4056: }
4057: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4058: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4059: ibdiag += 4;
4060: row += 2;
4061: break;
4062: case 3:
4063: v2 = a->a + ii[row + 1];
4064: v3 = a->a + ii[row + 2];
4065: sum1 = t[row];
4066: sum2 = t[row + 1];
4067: sum3 = t[row + 2];
4068: for (n = 0; n < sz - 1; n += 2) {
4069: i1 = idx[0];
4070: i2 = idx[1];
4071: idx += 2;
4072: tmp0 = t[i1];
4073: tmp1 = t[i2];
4074: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075: v1 += 2;
4076: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077: v2 += 2;
4078: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4079: v3 += 2;
4080: }
4082: if (n == sz - 1) {
4083: tmp0 = t[*idx];
4084: sum1 -= v1[0] * tmp0;
4085: sum2 -= v2[0] * tmp0;
4086: sum3 -= v3[0] * tmp0;
4087: }
4088: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4089: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4090: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4091: ibdiag += 9;
4092: row += 3;
4093: break;
4094: case 4:
4095: v2 = a->a + ii[row + 1];
4096: v3 = a->a + ii[row + 2];
4097: v4 = a->a + ii[row + 3];
4098: sum1 = t[row];
4099: sum2 = t[row + 1];
4100: sum3 = t[row + 2];
4101: sum4 = t[row + 3];
4102: for (n = 0; n < sz - 1; n += 2) {
4103: i1 = idx[0];
4104: i2 = idx[1];
4105: idx += 2;
4106: tmp0 = t[i1];
4107: tmp1 = t[i2];
4108: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4109: v1 += 2;
4110: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4111: v2 += 2;
4112: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4113: v3 += 2;
4114: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4115: v4 += 2;
4116: }
4118: if (n == sz - 1) {
4119: tmp0 = t[*idx];
4120: sum1 -= v1[0] * tmp0;
4121: sum2 -= v2[0] * tmp0;
4122: sum3 -= v3[0] * tmp0;
4123: sum4 -= v4[0] * tmp0;
4124: }
4125: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4126: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4127: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4128: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4129: ibdiag += 16;
4130: row += 4;
4131: break;
4132: case 5:
4133: v2 = a->a + ii[row + 1];
4134: v3 = a->a + ii[row + 2];
4135: v4 = a->a + ii[row + 3];
4136: v5 = a->a + ii[row + 4];
4137: sum1 = t[row];
4138: sum2 = t[row + 1];
4139: sum3 = t[row + 2];
4140: sum4 = t[row + 3];
4141: sum5 = t[row + 4];
4142: for (n = 0; n < sz - 1; n += 2) {
4143: i1 = idx[0];
4144: i2 = idx[1];
4145: idx += 2;
4146: tmp0 = t[i1];
4147: tmp1 = t[i2];
4148: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4149: v1 += 2;
4150: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4151: v2 += 2;
4152: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4153: v3 += 2;
4154: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4155: v4 += 2;
4156: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4157: v5 += 2;
4158: }
4160: if (n == sz - 1) {
4161: tmp0 = t[*idx];
4162: sum1 -= v1[0] * tmp0;
4163: sum2 -= v2[0] * tmp0;
4164: sum3 -= v3[0] * tmp0;
4165: sum4 -= v4[0] * tmp0;
4166: sum5 -= v5[0] * tmp0;
4167: }
4168: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4169: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4170: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4171: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4172: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4173: ibdiag += 25;
4174: row += 5;
4175: break;
4176: default:
4177: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4178: }
4179: }
4180: PetscCall(PetscLogFlops(a->nz));
4181: }
4182: PetscCall(VecRestoreArray(xx, &x));
4183: PetscCall(VecRestoreArrayRead(bb, &b));
4184: PetscFunctionReturn(PETSC_SUCCESS);
4185: }
4187: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4188: {
4189: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4190: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4191: const MatScalar *bdiag = a->inode.bdiag;
4192: const PetscScalar *b;
4193: PetscInt m = a->inode.node_count, cnt = 0, i, row;
4194: const PetscInt *sizes = a->inode.size;
4196: PetscFunctionBegin;
4197: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4198: PetscCall(VecGetArray(xx, &x));
4199: PetscCall(VecGetArrayRead(bb, &b));
4200: cnt = 0;
4201: for (i = 0, row = 0; i < m; i++) {
4202: switch (sizes[i]) {
4203: case 1:
4204: x[row] = b[row] * bdiag[cnt++];
4205: row++;
4206: break;
4207: case 2:
4208: x1 = b[row];
4209: x2 = b[row + 1];
4210: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4211: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4212: x[row++] = tmp1;
4213: x[row++] = tmp2;
4214: cnt += 4;
4215: break;
4216: case 3:
4217: x1 = b[row];
4218: x2 = b[row + 1];
4219: x3 = b[row + 2];
4220: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4221: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4222: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4223: x[row++] = tmp1;
4224: x[row++] = tmp2;
4225: x[row++] = tmp3;
4226: cnt += 9;
4227: break;
4228: case 4:
4229: x1 = b[row];
4230: x2 = b[row + 1];
4231: x3 = b[row + 2];
4232: x4 = b[row + 3];
4233: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4234: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4235: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4236: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4237: x[row++] = tmp1;
4238: x[row++] = tmp2;
4239: x[row++] = tmp3;
4240: x[row++] = tmp4;
4241: cnt += 16;
4242: break;
4243: case 5:
4244: x1 = b[row];
4245: x2 = b[row + 1];
4246: x3 = b[row + 2];
4247: x4 = b[row + 3];
4248: x5 = b[row + 4];
4249: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4250: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4251: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4252: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4253: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4254: x[row++] = tmp1;
4255: x[row++] = tmp2;
4256: x[row++] = tmp3;
4257: x[row++] = tmp4;
4258: x[row++] = tmp5;
4259: cnt += 25;
4260: break;
4261: default:
4262: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4263: }
4264: }
4265: PetscCall(PetscLogFlops(2.0 * cnt));
4266: PetscCall(VecRestoreArray(xx, &x));
4267: PetscCall(VecRestoreArrayRead(bb, &b));
4268: PetscFunctionReturn(PETSC_SUCCESS);
4269: }
4271: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4272: {
4273: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4275: PetscFunctionBegin;
4276: a->inode.node_count = 0;
4277: a->inode.use = PETSC_FALSE;
4278: a->inode.checked = PETSC_FALSE;
4279: a->inode.mat_nonzerostate = -1;
4280: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4281: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4282: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4283: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4284: A->ops->coloringpatch = NULL;
4285: A->ops->multdiagonalblock = NULL;
4286: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4287: PetscFunctionReturn(PETSC_SUCCESS);
4288: }
4290: /*
4291: samestructure indicates that the matrix has not changed its nonzero structure so we
4292: do not need to recompute the inodes
4293: */
4294: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4295: {
4296: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4297: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4298: PetscBool flag;
4299: const PetscInt *idx, *idy, *ii;
4301: PetscFunctionBegin;
4302: if (!a->inode.use) {
4303: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4304: PetscCall(PetscFree(a->inode.size));
4305: PetscFunctionReturn(PETSC_SUCCESS);
4306: }
4307: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4309: m = A->rmap->n;
4310: if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4311: ns = a->inode.size;
4313: i = 0;
4314: node_count = 0;
4315: idx = a->j;
4316: ii = a->i;
4317: if (idx) {
4318: while (i < m) { /* For each row */
4319: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4320: /* Limits the number of elements in a node to 'a->inode.limit' */
4321: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4322: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4323: if (nzy != nzx) break;
4324: idy += nzx; /* Same nonzero pattern */
4325: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4326: if (!flag) break;
4327: }
4328: ns[node_count++] = blk_size;
4329: idx += blk_size * nzx;
4330: i = j;
4331: }
4332: }
4333: /* If not enough inodes found,, do not use inode version of the routines */
4334: if (!m || !idx || node_count > .8 * m) {
4335: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4336: PetscCall(PetscFree(a->inode.size));
4337: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4338: } else {
4339: if (!A->factortype) {
4340: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4341: if (A->rmap->n == A->cmap->n) {
4342: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4343: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4344: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4345: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4346: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4347: }
4348: } else {
4349: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4350: }
4351: a->inode.node_count = node_count;
4352: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4353: }
4354: a->inode.checked = PETSC_TRUE;
4355: a->inode.mat_nonzerostate = A->nonzerostate;
4356: PetscFunctionReturn(PETSC_SUCCESS);
4357: }
4359: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4360: {
4361: Mat B = *C;
4362: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4363: PetscInt m = A->rmap->n;
4365: PetscFunctionBegin;
4366: c->inode.use = a->inode.use;
4367: c->inode.limit = a->inode.limit;
4368: c->inode.max_limit = a->inode.max_limit;
4369: c->inode.checked = PETSC_FALSE;
4370: c->inode.size = NULL;
4371: c->inode.node_count = 0;
4372: c->inode.ibdiagvalid = PETSC_FALSE;
4373: c->inode.ibdiag = NULL;
4374: c->inode.bdiag = NULL;
4375: c->inode.mat_nonzerostate = -1;
4376: if (a->inode.use) {
4377: if (a->inode.checked && a->inode.size) {
4378: PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4379: PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4381: c->inode.checked = PETSC_TRUE;
4382: c->inode.node_count = a->inode.node_count;
4383: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4384: }
4385: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4386: if (!B->factortype) {
4387: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4388: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4389: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4390: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4391: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4392: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4393: } else {
4394: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4395: }
4396: }
4397: PetscFunctionReturn(PETSC_SUCCESS);
4398: }
4400: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4401: {
4402: PetscInt k;
4403: const PetscInt *vi;
4405: PetscFunctionBegin;
4406: vi = aj + ai[row];
4407: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4408: vi = aj + adiag[row];
4409: cols[nzl] = vi[0];
4410: vi = aj + adiag[row + 1] + 1;
4411: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4412: PetscFunctionReturn(PETSC_SUCCESS);
4413: }
4414: /*
4415: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4416: Modified from MatSeqAIJCheckInode().
4418: Input Parameters:
4419: . Mat A - ILU or LU matrix factor
4421: */
4422: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4423: {
4424: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4425: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4426: PetscInt *cols1, *cols2, *ns;
4427: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4428: PetscBool flag;
4430: PetscFunctionBegin;
4431: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4432: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4434: m = A->rmap->n;
4435: if (a->inode.size) ns = a->inode.size;
4436: else PetscCall(PetscMalloc1(m + 1, &ns));
4438: i = 0;
4439: node_count = 0;
4440: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4441: while (i < m) { /* For each row */
4442: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4443: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4444: nzx = nzl1 + nzu1 + 1;
4445: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4447: /* Limits the number of elements in a node to 'a->inode.limit' */
4448: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4449: nzl2 = ai[j + 1] - ai[j];
4450: nzu2 = adiag[j] - adiag[j + 1] - 1;
4451: nzy = nzl2 + nzu2 + 1;
4452: if (nzy != nzx) break;
4453: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4454: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4455: if (!flag) break;
4456: }
4457: ns[node_count++] = blk_size;
4458: i = j;
4459: }
4460: PetscCall(PetscFree2(cols1, cols2));
4461: /* If not enough inodes found,, do not use inode version of the routines */
4462: if (!m || node_count > .8 * m) {
4463: PetscCall(PetscFree(ns));
4465: a->inode.node_count = 0;
4466: a->inode.size = NULL;
4467: a->inode.use = PETSC_FALSE;
4469: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4470: } else {
4471: A->ops->mult = NULL;
4472: A->ops->sor = NULL;
4473: A->ops->multadd = NULL;
4474: A->ops->getrowij = NULL;
4475: A->ops->restorerowij = NULL;
4476: A->ops->getcolumnij = NULL;
4477: A->ops->restorecolumnij = NULL;
4478: A->ops->coloringpatch = NULL;
4479: A->ops->multdiagonalblock = NULL;
4480: a->inode.node_count = node_count;
4481: a->inode.size = ns;
4483: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4484: }
4485: a->inode.checked = PETSC_TRUE;
4486: PetscFunctionReturn(PETSC_SUCCESS);
4487: }
4489: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4490: {
4491: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4493: PetscFunctionBegin;
4494: a->inode.ibdiagvalid = PETSC_FALSE;
4495: PetscFunctionReturn(PETSC_SUCCESS);
4496: }
4498: /*
4499: This is really ugly. if inodes are used this replaces the
4500: permutations with ones that correspond to rows/cols of the matrix
4501: rather than inode blocks
4502: */
4503: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4504: {
4505: PetscFunctionBegin;
4506: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4507: PetscFunctionReturn(PETSC_SUCCESS);
4508: }
4510: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4511: {
4512: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4513: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4514: const PetscInt *ridx, *cidx;
4515: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4516: PetscInt nslim_col, *ns_col;
4517: IS ris = *rperm, cis = *cperm;
4519: PetscFunctionBegin;
4520: if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4521: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4523: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4524: PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4525: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4527: PetscCall(ISGetIndices(ris, &ridx));
4528: PetscCall(ISGetIndices(cis, &cidx));
4530: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4531: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4533: /* Construct the permutations for rows*/
4534: for (i = 0, row = 0; i < nslim_row; ++i) {
4535: indx = ridx[i];
4536: start_val = tns[indx];
4537: end_val = tns[indx + 1];
4538: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4539: }
4541: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4542: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4544: /* Construct permutations for columns */
4545: for (i = 0, col = 0; i < nslim_col; ++i) {
4546: indx = cidx[i];
4547: start_val = tns[indx];
4548: end_val = tns[indx + 1];
4549: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4550: }
4552: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4553: PetscCall(ISSetPermutation(*rperm));
4554: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4555: PetscCall(ISSetPermutation(*cperm));
4557: PetscCall(ISRestoreIndices(ris, &ridx));
4558: PetscCall(ISRestoreIndices(cis, &cidx));
4560: PetscCall(PetscFree(ns_col));
4561: PetscCall(PetscFree2(permr, permc));
4562: PetscCall(ISDestroy(&cis));
4563: PetscCall(ISDestroy(&ris));
4564: PetscCall(PetscFree(tns));
4565: PetscFunctionReturn(PETSC_SUCCESS);
4566: }
4568: /*@C
4569: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4571: Not Collective
4573: Input Parameter:
4574: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4576: Output Parameters:
4577: + node_count - no of inodes present in the matrix.
4578: . sizes - an array of size `node_count`, with the sizes of each inode.
4579: - limit - the max size used to generate the inodes.
4581: Level: advanced
4583: Note:
4584: It should be called after the matrix is assembled.
4585: The contents of the sizes[] array should not be changed.
4586: `NULL` may be passed for information not needed
4588: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4589: @*/
4590: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4591: {
4592: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4594: PetscFunctionBegin;
4595: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4596: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4597: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4598: PetscFunctionReturn(PETSC_SUCCESS);
4599: }
4601: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4602: {
4603: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4605: PetscFunctionBegin;
4606: if (node_count) *node_count = a->inode.node_count;
4607: if (sizes) *sizes = a->inode.size;
4608: if (limit) *limit = a->inode.limit;
4609: PetscFunctionReturn(PETSC_SUCCESS);
4610: }