Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size_csr;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24: for (; count + 1 < n; count++, i++);
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
30: ns_col[0] = 0;
32: /* Use the same row structure wherever feasible. */
33: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
35: /* if m < n; pad up the remainder with inode_limit */
36: for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37: /* The last node is the odd ball. pad it up with the remaining rows; */
38: if (count < n) {
39: ns_col[i + 1] = ns_col[i] + (n - count);
40: i++;
41: } else if (count > n) {
42: /* Adjust for the over estimation */
43: ns_col[i] += n - count;
44: }
45: *size = i;
46: *ns = ns_col;
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
50: /*
51: This builds symmetric version of nonzero structure,
52: */
53: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54: {
55: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
56: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
60: PetscFunctionBegin;
61: nslim_row = a->inode.node_count;
62: m = A->rmap->n;
63: n = A->cmap->n;
64: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
67: /* Use the row_inode as column_inode */
68: nslim_col = nslim_row;
69: ns_col = ns_row;
71: /* allocate space for reformatted inode structure */
72: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
75: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76: nsz = ns_col[i1 + 1] - ns_col[i1];
77: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78: }
79: /* allocate space for row pointers */
80: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81: *iia = ia;
82: PetscCall(PetscMalloc1(nslim_row + 1, &work));
84: /* determine the number of columns in each row */
85: ia[0] = oshift;
86: for (i1 = 0; i1 < nslim_row; i1++) {
87: row = ns_row[i1];
88: j = aj + ai[row] + ishift;
89: jmax = aj + ai[row + 1] + ishift;
90: if (j == jmax) continue; /* empty row */
91: col = *j++ + ishift;
92: i2 = tvc[col];
93: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94: ia[i1 + 1]++;
95: ia[i2 + 1]++;
96: i2++; /* Start col of next node */
97: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98: i2 = tvc[col];
99: }
100: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101: }
103: /* shift ia[i] to point to next row */
104: for (i1 = 1; i1 < nslim_row + 1; i1++) {
105: row = ia[i1 - 1];
106: ia[i1] += row;
107: work[i1 - 1] = row - oshift;
108: }
110: /* allocate space for column pointers */
111: nz = ia[nslim_row] + (!ishift);
112: PetscCall(PetscMalloc1(nz, &ja));
113: *jja = ja;
115: /* loop over lower triangular part putting into ja */
116: for (i1 = 0; i1 < nslim_row; i1++) {
117: row = ns_row[i1];
118: j = aj + ai[row] + ishift;
119: jmax = aj + ai[row + 1] + ishift;
120: if (j == jmax) continue; /* empty row */
121: col = *j++ + ishift;
122: i2 = tvc[col];
123: while (i2 < i1 && j < jmax) {
124: ja[work[i2]++] = i1 + oshift;
125: ja[work[i1]++] = i2 + oshift;
126: ++i2;
127: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128: i2 = tvc[col];
129: }
130: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131: }
132: PetscCall(PetscFree(work));
133: PetscCall(PetscFree2(tns, tvc));
134: PetscFunctionReturn(PETSC_SUCCESS);
135: }
137: /*
138: This builds nonsymmetric version of nonzero structure,
139: */
140: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141: {
142: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
143: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144: PetscInt *tns, *tvc, nsz, i1, i2;
145: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
147: PetscFunctionBegin;
148: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149: nslim_row = a->inode.node_count;
150: n = A->cmap->n;
152: /* Create The column_inode for this matrix */
153: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
155: /* allocate space for reformatted column_inode structure */
156: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
159: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160: nsz = ns_col[i1 + 1] - ns_col[i1];
161: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162: }
163: /* allocate space for row pointers */
164: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165: *iia = ia;
166: PetscCall(PetscMalloc1(nslim_row + 1, &work));
168: /* determine the number of columns in each row */
169: ia[0] = oshift;
170: for (i1 = 0; i1 < nslim_row; i1++) {
171: row = ns_row[i1];
172: j = aj + ai[row] + ishift;
173: nz = ai[row + 1] - ai[row];
174: if (!nz) continue; /* empty row */
175: col = *j++ + ishift;
176: i2 = tvc[col];
177: while (nz-- > 0) { /* off-diagonal elements */
178: ia[i1 + 1]++;
179: i2++; /* Start col of next node */
180: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181: if (nz > 0) i2 = tvc[col];
182: }
183: }
185: /* shift ia[i] to point to next row */
186: for (i1 = 1; i1 < nslim_row + 1; i1++) {
187: row = ia[i1 - 1];
188: ia[i1] += row;
189: work[i1 - 1] = row - oshift;
190: }
192: /* allocate space for column pointers */
193: nz = ia[nslim_row] + (!ishift);
194: PetscCall(PetscMalloc1(nz, &ja));
195: *jja = ja;
197: /* loop over matrix putting into ja */
198: for (i1 = 0; i1 < nslim_row; i1++) {
199: row = ns_row[i1];
200: j = aj + ai[row] + ishift;
201: nz = ai[row + 1] - ai[row];
202: if (!nz) continue; /* empty row */
203: col = *j++ + ishift;
204: i2 = tvc[col];
205: while (nz-- > 0) {
206: ja[work[i1]++] = i2 + oshift;
207: ++i2;
208: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209: if (nz > 0) i2 = tvc[col];
210: }
211: }
212: PetscCall(PetscFree(ns_col));
213: PetscCall(PetscFree(work));
214: PetscCall(PetscFree2(tns, tvc));
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
218: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219: {
220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
222: PetscFunctionBegin;
223: if (n) *n = a->inode.node_count;
224: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225: if (!blockcompressed) {
226: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227: } else if (symmetric) {
228: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229: } else {
230: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231: }
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236: {
237: PetscFunctionBegin;
238: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
240: if (!blockcompressed) {
241: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242: } else {
243: PetscCall(PetscFree(*ia));
244: PetscCall(PetscFree(*ja));
245: }
246: PetscFunctionReturn(PETSC_SUCCESS);
247: }
249: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250: {
251: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
255: PetscFunctionBegin;
256: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257: nslim_row = a->inode.node_count;
258: n = A->cmap->n;
260: /* Create The column_inode for this matrix */
261: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
263: /* allocate space for reformatted column_inode structure */
264: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
267: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268: nsz = ns_col[i1 + 1] - ns_col[i1];
269: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270: }
271: /* allocate space for column pointers */
272: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273: *iia = ia;
274: PetscCall(PetscMalloc1(nslim_col + 1, &work));
276: /* determine the number of columns in each row */
277: ia[0] = oshift;
278: for (i1 = 0; i1 < nslim_row; i1++) {
279: row = ns_row[i1];
280: j = aj + ai[row] + ishift;
281: col = *j++ + ishift;
282: i2 = tvc[col];
283: nz = ai[row + 1] - ai[row];
284: while (nz-- > 0) { /* off-diagonal elements */
285: /* ia[i1+1]++; */
286: ia[i2 + 1]++;
287: i2++;
288: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289: if (nz > 0) i2 = tvc[col];
290: }
291: }
293: /* shift ia[i] to point to next col */
294: for (i1 = 1; i1 < nslim_col + 1; i1++) {
295: col = ia[i1 - 1];
296: ia[i1] += col;
297: work[i1 - 1] = col - oshift;
298: }
300: /* allocate space for column pointers */
301: nz = ia[nslim_col] + (!ishift);
302: PetscCall(PetscMalloc1(nz, &ja));
303: *jja = ja;
305: /* loop over matrix putting into ja */
306: for (i1 = 0; i1 < nslim_row; i1++) {
307: row = ns_row[i1];
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row + 1] - ai[row];
312: while (nz-- > 0) {
313: /* ja[work[i1]++] = i2 + oshift; */
314: ja[work[i2]++] = i1 + oshift;
315: i2++;
316: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
320: PetscCall(PetscFree(ns_col));
321: PetscCall(PetscFree(work));
322: PetscCall(PetscFree2(tns, tvc));
323: PetscFunctionReturn(PETSC_SUCCESS);
324: }
326: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327: {
328: PetscFunctionBegin;
329: PetscCall(MatCreateColInode_Private(A, n, NULL));
330: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
332: if (!blockcompressed) {
333: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334: } else if (symmetric) {
335: /* Since the indices are symmetric it doesn't matter */
336: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337: } else {
338: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339: }
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344: {
345: PetscFunctionBegin;
346: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347: if (!blockcompressed) {
348: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349: } else {
350: PetscCall(PetscFree(*ia));
351: PetscCall(PetscFree(*ja));
352: }
353: PetscFunctionReturn(PETSC_SUCCESS);
354: }
356: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357: {
358: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359: PetscScalar *y;
360: const PetscScalar *x;
361: PetscInt row, node_max, nonzerorow = 0;
362: PetscInt *ns;
364: PetscFunctionBegin;
365: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366: node_max = a->inode.node_count;
367: ns = a->inode.size_csr; /* Node Size array */
368: PetscCall(VecGetArrayRead(xx, &x));
369: PetscCall(VecGetArray(yy, &y));
371: PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372: for (PetscInt i = 0; i < node_max; ++i) {
373: PetscInt i1, i2, nsz, n, sz;
374: const MatScalar *v1, *v2, *v3, *v4, *v5;
375: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376: const PetscInt *idx;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380: #endif
381: row = ns[i];
382: nsz = ns[i + 1] - ns[i];
383: n = a->i[row + 1] - a->i[row];
384: nonzerorow += (n > 0) * nsz;
386: idx = &a->j[a->i[row]];
387: v1 = &a->a[a->i[row]];
388: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
389: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
390: sz = n; /* No of non zeros in this row */
391: /* Switch on the size of Node */
392: switch (nsz) { /* Each loop in 'case' is unrolled */
393: case 1:
394: sum1 = 0.;
396: for (n = 0; n < sz - 1; n += 2) {
397: i1 = idx[0]; /* The instructions are ordered to */
398: i2 = idx[1]; /* make the compiler's job easy */
399: idx += 2;
400: tmp0 = x[i1];
401: tmp1 = x[i2];
402: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403: v1 += 2;
404: }
406: if (n == sz - 1) { /* Take care of the last nonzero */
407: tmp0 = x[*idx++];
408: sum1 += *v1++ * tmp0;
409: }
410: y[row++] = sum1;
411: break;
412: case 2:
413: sum1 = 0.;
414: sum2 = 0.;
415: v2 = v1 + n;
417: for (n = 0; n < sz - 1; n += 2) {
418: i1 = idx[0];
419: i2 = idx[1];
420: idx += 2;
421: tmp0 = x[i1];
422: tmp1 = x[i2];
423: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424: v1 += 2;
425: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426: v2 += 2;
427: }
428: if (n == sz - 1) {
429: tmp0 = x[*idx++];
430: sum1 += *v1++ * tmp0;
431: sum2 += *v2++ * tmp0;
432: }
433: y[row++] = sum1;
434: y[row++] = sum2;
435: v1 = v2; /* Since the next block to be processed starts there*/
436: idx += sz;
437: break;
438: case 3:
439: sum1 = 0.;
440: sum2 = 0.;
441: sum3 = 0.;
442: v2 = v1 + n;
443: v3 = v2 + n;
445: for (n = 0; n < sz - 1; n += 2) {
446: i1 = idx[0];
447: i2 = idx[1];
448: idx += 2;
449: tmp0 = x[i1];
450: tmp1 = x[i2];
451: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452: v1 += 2;
453: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454: v2 += 2;
455: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456: v3 += 2;
457: }
458: if (n == sz - 1) {
459: tmp0 = x[*idx++];
460: sum1 += *v1++ * tmp0;
461: sum2 += *v2++ * tmp0;
462: sum3 += *v3++ * tmp0;
463: }
464: y[row++] = sum1;
465: y[row++] = sum2;
466: y[row++] = sum3;
467: v1 = v3; /* Since the next block to be processed starts there*/
468: idx += 2 * sz;
469: break;
470: case 4:
471: sum1 = 0.;
472: sum2 = 0.;
473: sum3 = 0.;
474: sum4 = 0.;
475: v2 = v1 + n;
476: v3 = v2 + n;
477: v4 = v3 + n;
479: for (n = 0; n < sz - 1; n += 2) {
480: i1 = idx[0];
481: i2 = idx[1];
482: idx += 2;
483: tmp0 = x[i1];
484: tmp1 = x[i2];
485: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486: v1 += 2;
487: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488: v2 += 2;
489: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490: v3 += 2;
491: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492: v4 += 2;
493: }
494: if (n == sz - 1) {
495: tmp0 = x[*idx++];
496: sum1 += *v1++ * tmp0;
497: sum2 += *v2++ * tmp0;
498: sum3 += *v3++ * tmp0;
499: sum4 += *v4++ * tmp0;
500: }
501: y[row++] = sum1;
502: y[row++] = sum2;
503: y[row++] = sum3;
504: y[row++] = sum4;
505: v1 = v4; /* Since the next block to be processed starts there*/
506: idx += 3 * sz;
507: break;
508: case 5:
509: sum1 = 0.;
510: sum2 = 0.;
511: sum3 = 0.;
512: sum4 = 0.;
513: sum5 = 0.;
514: v2 = v1 + n;
515: v3 = v2 + n;
516: v4 = v3 + n;
517: v5 = v4 + n;
519: for (n = 0; n < sz - 1; n += 2) {
520: i1 = idx[0];
521: i2 = idx[1];
522: idx += 2;
523: tmp0 = x[i1];
524: tmp1 = x[i2];
525: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526: v1 += 2;
527: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528: v2 += 2;
529: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530: v3 += 2;
531: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532: v4 += 2;
533: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534: v5 += 2;
535: }
536: if (n == sz - 1) {
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++] = sum1;
545: y[row++] = sum2;
546: y[row++] = sum3;
547: y[row++] = sum4;
548: y[row++] = sum5;
549: v1 = v5; /* Since the next block to be processed starts there */
550: idx += 4 * sz;
551: break;
552: default:
553: SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554: }
555: }
556: PetscCall(VecRestoreArrayRead(xx, &x));
557: PetscCall(VecRestoreArray(yy, &y));
558: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559: PetscFunctionReturn(PETSC_SUCCESS);
560: }
562: /* Almost same code as the MatMult_SeqAIJ_Inode() */
563: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
566: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567: const MatScalar *v1, *v2, *v3, *v4, *v5;
568: const PetscScalar *x;
569: PetscScalar *y, *z, *zt;
570: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
571: const PetscInt *idx, *ns, *ii;
573: PetscFunctionBegin;
574: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575: node_max = a->inode.node_count;
576: ns = a->inode.size_csr; /* Node Size array */
578: PetscCall(VecGetArrayRead(xx, &x));
579: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580: zt = z;
582: idx = a->j;
583: v1 = a->a;
584: ii = a->i;
586: for (i = 0; i < node_max; ++i) {
587: row = ns[i];
588: nsz = ns[i + 1] - ns[i];
589: n = ii[1] - ii[0];
590: ii += nsz;
591: sz = n; /* No of non zeros in this row */
592: /* Switch on the size of Node */
593: switch (nsz) { /* Each loop in 'case' is unrolled */
594: case 1:
595: sum1 = *zt++;
597: for (n = 0; n < sz - 1; n += 2) {
598: i1 = idx[0]; /* The instructions are ordered to */
599: i2 = idx[1]; /* make the compiler's job easy */
600: idx += 2;
601: tmp0 = x[i1];
602: tmp1 = x[i2];
603: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604: v1 += 2;
605: }
607: if (n == sz - 1) { /* Take care of the last nonzero */
608: tmp0 = x[*idx++];
609: sum1 += *v1++ * tmp0;
610: }
611: y[row++] = sum1;
612: break;
613: case 2:
614: sum1 = *zt++;
615: sum2 = *zt++;
616: v2 = v1 + n;
618: for (n = 0; n < sz - 1; n += 2) {
619: i1 = idx[0];
620: i2 = idx[1];
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625: v1 += 2;
626: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627: v2 += 2;
628: }
629: if (n == sz - 1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++] = sum1;
635: y[row++] = sum2;
636: v1 = v2; /* Since the next block to be processed starts there*/
637: idx += sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n < sz - 1; n += 2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653: v1 += 2;
654: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655: v2 += 2;
656: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657: v3 += 2;
658: }
659: if (n == sz - 1) {
660: tmp0 = x[*idx++];
661: sum1 += *v1++ * tmp0;
662: sum2 += *v2++ * tmp0;
663: sum3 += *v3++ * tmp0;
664: }
665: y[row++] = sum1;
666: y[row++] = sum2;
667: y[row++] = sum3;
668: v1 = v3; /* Since the next block to be processed starts there*/
669: idx += 2 * sz;
670: break;
671: case 4:
672: sum1 = *zt++;
673: sum2 = *zt++;
674: sum3 = *zt++;
675: sum4 = *zt++;
676: v2 = v1 + n;
677: v3 = v2 + n;
678: v4 = v3 + n;
680: for (n = 0; n < sz - 1; n += 2) {
681: i1 = idx[0];
682: i2 = idx[1];
683: idx += 2;
684: tmp0 = x[i1];
685: tmp1 = x[i2];
686: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687: v1 += 2;
688: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689: v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691: v3 += 2;
692: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693: v4 += 2;
694: }
695: if (n == sz - 1) {
696: tmp0 = x[*idx++];
697: sum1 += *v1++ * tmp0;
698: sum2 += *v2++ * tmp0;
699: sum3 += *v3++ * tmp0;
700: sum4 += *v4++ * tmp0;
701: }
702: y[row++] = sum1;
703: y[row++] = sum2;
704: y[row++] = sum3;
705: y[row++] = sum4;
706: v1 = v4; /* Since the next block to be processed starts there*/
707: idx += 3 * sz;
708: break;
709: case 5:
710: sum1 = *zt++;
711: sum2 = *zt++;
712: sum3 = *zt++;
713: sum4 = *zt++;
714: sum5 = *zt++;
715: v2 = v1 + n;
716: v3 = v2 + n;
717: v4 = v3 + n;
718: v5 = v4 + n;
720: for (n = 0; n < sz - 1; n += 2) {
721: i1 = idx[0];
722: i2 = idx[1];
723: idx += 2;
724: tmp0 = x[i1];
725: tmp1 = x[i2];
726: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727: v1 += 2;
728: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729: v2 += 2;
730: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731: v3 += 2;
732: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733: v4 += 2;
734: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735: v5 += 2;
736: }
737: if (n == sz - 1) {
738: tmp0 = x[*idx++];
739: sum1 += *v1++ * tmp0;
740: sum2 += *v2++ * tmp0;
741: sum3 += *v3++ * tmp0;
742: sum4 += *v4++ * tmp0;
743: sum5 += *v5++ * tmp0;
744: }
745: y[row++] = sum1;
746: y[row++] = sum2;
747: y[row++] = sum3;
748: y[row++] = sum4;
749: y[row++] = sum5;
750: v1 = v5; /* Since the next block to be processed starts there */
751: idx += 4 * sz;
752: break;
753: default:
754: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755: }
756: }
757: PetscCall(VecRestoreArrayRead(xx, &x));
758: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759: PetscCall(PetscLogFlops(2.0 * a->nz));
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }
763: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764: {
765: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
766: IS iscol = a->col, isrow = a->row;
767: const PetscInt *r, *c, *rout, *cout;
768: PetscInt i, j, n = A->rmap->n, nz;
769: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
770: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772: PetscScalar sum1, sum2, sum3, sum4, sum5;
773: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774: const PetscScalar *b;
776: PetscFunctionBegin;
777: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778: node_max = a->inode.node_count;
779: ns = a->inode.size_csr; /* Node Size array */
781: PetscCall(VecGetArrayRead(bb, &b));
782: PetscCall(VecGetArrayWrite(xx, &x));
783: tmp = a->solve_work;
785: PetscCall(ISGetIndices(isrow, &rout));
786: r = rout;
787: PetscCall(ISGetIndices(iscol, &cout));
788: c = cout + (n - 1);
790: /* forward solve the lower triangular */
791: tmps = tmp;
792: aa = a_a;
793: aj = a_j;
794: ad = a->diag;
796: for (i = 0, row = 0; i < node_max; ++i) {
797: row = ns[i];
798: nsz = ns[i + 1] - ns[i];
799: aii = ai[row];
800: v1 = aa + aii;
801: vi = aj + aii;
802: nz = ad[row] - aii;
803: if (i < node_max - 1) {
804: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805: * but our indexing to determine its size could. */
806: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808: PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810: }
812: switch (nsz) { /* Each loop in 'case' is unrolled */
813: case 1:
814: sum1 = b[*r++];
815: for (j = 0; j < nz - 1; j += 2) {
816: i0 = vi[0];
817: i1 = vi[1];
818: vi += 2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822: v1 += 2;
823: }
824: if (j == nz - 1) {
825: tmp0 = tmps[*vi++];
826: sum1 -= *v1++ * tmp0;
827: }
828: tmp[row++] = sum1;
829: break;
830: case 2:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: v2 = aa + ai[row + 1];
835: for (j = 0; j < nz - 1; j += 2) {
836: i0 = vi[0];
837: i1 = vi[1];
838: vi += 2;
839: tmp0 = tmps[i0];
840: tmp1 = tmps[i1];
841: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842: v1 += 2;
843: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844: v2 += 2;
845: }
846: if (j == nz - 1) {
847: tmp0 = tmps[*vi++];
848: sum1 -= *v1++ * tmp0;
849: sum2 -= *v2++ * tmp0;
850: }
851: sum2 -= *v2++ * sum1;
852: tmp[row++] = sum1;
853: tmp[row++] = sum2;
854: break;
855: case 3:
856: sum1 = b[*r++];
857: sum2 = b[*r++];
858: sum3 = b[*r++];
859: v2 = aa + ai[row + 1];
860: v3 = aa + ai[row + 2];
862: for (j = 0; j < nz - 1; j += 2) {
863: i0 = vi[0];
864: i1 = vi[1];
865: vi += 2;
866: tmp0 = tmps[i0];
867: tmp1 = tmps[i1];
868: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869: v1 += 2;
870: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871: v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873: v3 += 2;
874: }
875: if (j == nz - 1) {
876: tmp0 = tmps[*vi++];
877: sum1 -= *v1++ * tmp0;
878: sum2 -= *v2++ * tmp0;
879: sum3 -= *v3++ * tmp0;
880: }
881: sum2 -= *v2++ * sum1;
882: sum3 -= *v3++ * sum1;
883: sum3 -= *v3++ * sum2;
885: tmp[row++] = sum1;
886: tmp[row++] = sum2;
887: tmp[row++] = sum3;
888: break;
890: case 4:
891: sum1 = b[*r++];
892: sum2 = b[*r++];
893: sum3 = b[*r++];
894: sum4 = b[*r++];
895: v2 = aa + ai[row + 1];
896: v3 = aa + ai[row + 2];
897: v4 = aa + ai[row + 3];
899: for (j = 0; j < nz - 1; j += 2) {
900: i0 = vi[0];
901: i1 = vi[1];
902: vi += 2;
903: tmp0 = tmps[i0];
904: tmp1 = tmps[i1];
905: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906: v1 += 2;
907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908: v2 += 2;
909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910: v3 += 2;
911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912: v4 += 2;
913: }
914: if (j == nz - 1) {
915: tmp0 = tmps[*vi++];
916: sum1 -= *v1++ * tmp0;
917: sum2 -= *v2++ * tmp0;
918: sum3 -= *v3++ * tmp0;
919: sum4 -= *v4++ * tmp0;
920: }
921: sum2 -= *v2++ * sum1;
922: sum3 -= *v3++ * sum1;
923: sum4 -= *v4++ * sum1;
924: sum3 -= *v3++ * sum2;
925: sum4 -= *v4++ * sum2;
926: sum4 -= *v4++ * sum3;
928: tmp[row++] = sum1;
929: tmp[row++] = sum2;
930: tmp[row++] = sum3;
931: tmp[row++] = sum4;
932: break;
933: case 5:
934: sum1 = b[*r++];
935: sum2 = b[*r++];
936: sum3 = b[*r++];
937: sum4 = b[*r++];
938: sum5 = b[*r++];
939: v2 = aa + ai[row + 1];
940: v3 = aa + ai[row + 2];
941: v4 = aa + ai[row + 3];
942: v5 = aa + ai[row + 4];
944: for (j = 0; j < nz - 1; j += 2) {
945: i0 = vi[0];
946: i1 = vi[1];
947: vi += 2;
948: tmp0 = tmps[i0];
949: tmp1 = tmps[i1];
950: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951: v1 += 2;
952: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953: v2 += 2;
954: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955: v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957: v4 += 2;
958: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959: v5 += 2;
960: }
961: if (j == nz - 1) {
962: tmp0 = tmps[*vi++];
963: sum1 -= *v1++ * tmp0;
964: sum2 -= *v2++ * tmp0;
965: sum3 -= *v3++ * tmp0;
966: sum4 -= *v4++ * tmp0;
967: sum5 -= *v5++ * tmp0;
968: }
970: sum2 -= *v2++ * sum1;
971: sum3 -= *v3++ * sum1;
972: sum4 -= *v4++ * sum1;
973: sum5 -= *v5++ * sum1;
974: sum3 -= *v3++ * sum2;
975: sum4 -= *v4++ * sum2;
976: sum5 -= *v5++ * sum2;
977: sum4 -= *v4++ * sum3;
978: sum5 -= *v5++ * sum3;
979: sum5 -= *v5++ * sum4;
981: tmp[row++] = sum1;
982: tmp[row++] = sum2;
983: tmp[row++] = sum3;
984: tmp[row++] = sum4;
985: tmp[row++] = sum5;
986: break;
987: default:
988: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989: }
990: }
991: /* backward solve the upper triangular */
992: for (i = node_max - 1; i >= 0; i--) {
993: row = ns[i + 1];
994: nsz = ns[i + 1] - ns[i];
995: aii = ai[row + 1] - 1;
996: v1 = aa + aii;
997: vi = aj + aii;
998: nz = aii - ad[row];
999: switch (nsz) { /* Each loop in 'case' is unrolled */
1000: case 1:
1001: sum1 = tmp[row];
1003: for (j = nz; j > 1; j -= 2) {
1004: vi -= 2;
1005: i0 = vi[2];
1006: i1 = vi[1];
1007: tmp0 = tmps[i0];
1008: tmp1 = tmps[i1];
1009: v1 -= 2;
1010: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011: }
1012: if (j == 1) {
1013: tmp0 = tmps[*vi--];
1014: sum1 -= *v1-- * tmp0;
1015: }
1016: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017: row--;
1018: break;
1019: case 2:
1020: sum1 = tmp[row];
1021: sum2 = tmp[row - 1];
1022: v2 = aa + ai[row] - 1;
1023: for (j = nz; j > 1; j -= 2) {
1024: vi -= 2;
1025: i0 = vi[2];
1026: i1 = vi[1];
1027: tmp0 = tmps[i0];
1028: tmp1 = tmps[i1];
1029: v1 -= 2;
1030: v2 -= 2;
1031: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033: }
1034: if (j == 1) {
1035: tmp0 = tmps[*vi--];
1036: sum1 -= *v1-- * tmp0;
1037: sum2 -= *v2-- * tmp0;
1038: }
1040: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041: row--;
1042: sum2 -= *v2-- * tmp0;
1043: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044: row--;
1045: break;
1046: case 3:
1047: sum1 = tmp[row];
1048: sum2 = tmp[row - 1];
1049: sum3 = tmp[row - 2];
1050: v2 = aa + ai[row] - 1;
1051: v3 = aa + ai[row - 1] - 1;
1052: for (j = nz; j > 1; j -= 2) {
1053: vi -= 2;
1054: i0 = vi[2];
1055: i1 = vi[1];
1056: tmp0 = tmps[i0];
1057: tmp1 = tmps[i1];
1058: v1 -= 2;
1059: v2 -= 2;
1060: v3 -= 2;
1061: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064: }
1065: if (j == 1) {
1066: tmp0 = tmps[*vi--];
1067: sum1 -= *v1-- * tmp0;
1068: sum2 -= *v2-- * tmp0;
1069: sum3 -= *v3-- * tmp0;
1070: }
1071: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072: row--;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076: row--;
1077: sum3 -= *v3-- * tmp0;
1078: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079: row--;
1081: break;
1082: case 4:
1083: sum1 = tmp[row];
1084: sum2 = tmp[row - 1];
1085: sum3 = tmp[row - 2];
1086: sum4 = tmp[row - 3];
1087: v2 = aa + ai[row] - 1;
1088: v3 = aa + ai[row - 1] - 1;
1089: v4 = aa + ai[row - 2] - 1;
1091: for (j = nz; j > 1; j -= 2) {
1092: vi -= 2;
1093: i0 = vi[2];
1094: i1 = vi[1];
1095: tmp0 = tmps[i0];
1096: tmp1 = tmps[i1];
1097: v1 -= 2;
1098: v2 -= 2;
1099: v3 -= 2;
1100: v4 -= 2;
1101: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105: }
1106: if (j == 1) {
1107: tmp0 = tmps[*vi--];
1108: sum1 -= *v1-- * tmp0;
1109: sum2 -= *v2-- * tmp0;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: }
1114: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115: row--;
1116: sum2 -= *v2-- * tmp0;
1117: sum3 -= *v3-- * tmp0;
1118: sum4 -= *v4-- * tmp0;
1119: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120: row--;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124: row--;
1125: sum4 -= *v4-- * tmp0;
1126: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127: row--;
1128: break;
1129: case 5:
1130: sum1 = tmp[row];
1131: sum2 = tmp[row - 1];
1132: sum3 = tmp[row - 2];
1133: sum4 = tmp[row - 3];
1134: sum5 = tmp[row - 4];
1135: v2 = aa + ai[row] - 1;
1136: v3 = aa + ai[row - 1] - 1;
1137: v4 = aa + ai[row - 2] - 1;
1138: v5 = aa + ai[row - 3] - 1;
1139: for (j = nz; j > 1; j -= 2) {
1140: vi -= 2;
1141: i0 = vi[2];
1142: i1 = vi[1];
1143: tmp0 = tmps[i0];
1144: tmp1 = tmps[i1];
1145: v1 -= 2;
1146: v2 -= 2;
1147: v3 -= 2;
1148: v4 -= 2;
1149: v5 -= 2;
1150: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155: }
1156: if (j == 1) {
1157: tmp0 = tmps[*vi--];
1158: sum1 -= *v1-- * tmp0;
1159: sum2 -= *v2-- * tmp0;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: }
1165: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166: row--;
1167: sum2 -= *v2-- * tmp0;
1168: sum3 -= *v3-- * tmp0;
1169: sum4 -= *v4-- * tmp0;
1170: sum5 -= *v5-- * tmp0;
1171: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172: row--;
1173: sum3 -= *v3-- * tmp0;
1174: sum4 -= *v4-- * tmp0;
1175: sum5 -= *v5-- * tmp0;
1176: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177: row--;
1178: sum4 -= *v4-- * tmp0;
1179: sum5 -= *v5-- * tmp0;
1180: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181: row--;
1182: sum5 -= *v5-- * tmp0;
1183: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184: row--;
1185: break;
1186: default:
1187: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188: }
1189: }
1190: PetscCall(ISRestoreIndices(isrow, &rout));
1191: PetscCall(ISRestoreIndices(iscol, &cout));
1192: PetscCall(VecRestoreArrayRead(bb, &b));
1193: PetscCall(VecRestoreArrayWrite(xx, &x));
1194: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199: {
1200: Mat C = B;
1201: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202: IS isrow = b->row, isicol = b->icol;
1203: const PetscInt *r, *ic, *ics;
1204: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205: PetscInt i, j, k, nz, nzL, row, *pj;
1206: const PetscInt *ajtmp, *bjtmp;
1207: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209: FactorShiftCtx sctx;
1210: const PetscInt *ddiag;
1211: PetscReal rs;
1212: MatScalar d;
1213: PetscInt inod, nodesz, node_max, col;
1214: const PetscInt *ns;
1215: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1217: PetscFunctionBegin;
1218: /* MatPivotSetUp(): initialize shift context sctx */
1219: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1221: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222: ddiag = a->diag;
1223: sctx.shift_top = info->zeropivot;
1224: for (i = 0; i < n; i++) {
1225: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226: d = (aa)[ddiag[i]];
1227: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228: v = aa + ai[i];
1229: nz = ai[i + 1] - ai[i];
1230: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231: if (rs > sctx.shift_top) sctx.shift_top = rs;
1232: }
1233: sctx.shift_top *= 1.1;
1234: sctx.nshift_max = 5;
1235: sctx.shift_lo = 0.;
1236: sctx.shift_hi = 1.;
1237: }
1239: PetscCall(ISGetIndices(isrow, &r));
1240: PetscCall(ISGetIndices(isicol, &ic));
1242: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243: ics = ic;
1245: node_max = a->inode.node_count;
1246: ns = a->inode.size_csr;
1247: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1249: /* If max inode size > 4, split it into two inodes.*/
1250: /* also map the inode sizes according to the ordering */
1251: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253: nodesz = ns[i + 1] - ns[i];
1254: if (nodesz > 4) {
1255: tmp_vec1[j] = 4;
1256: ++j;
1257: tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258: } else {
1259: tmp_vec1[j] = nodesz;
1260: }
1261: }
1262: /* Use the correct node_max */
1263: node_max = j;
1265: /* Now reorder the inode info based on mat re-ordering info */
1266: /* First create a row -> inode_size_array_index map */
1267: PetscCall(PetscMalloc1(n + 1, &nsmap));
1268: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269: tmp_vec2[0] = 0;
1270: for (i = 0, row = 0; i < node_max; i++) {
1271: nodesz = tmp_vec1[i];
1272: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273: }
1274: /* Using nsmap, create a reordered ns structure */
1275: for (i = 0, j = 0; i < node_max; i++) {
1276: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277: tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278: j += nodesz;
1279: }
1280: PetscCall(PetscFree(nsmap));
1281: PetscCall(PetscFree(tmp_vec1));
1283: /* Now use the correct ns */
1284: ns = tmp_vec2;
1286: do {
1287: sctx.newshift = PETSC_FALSE;
1288: /* Now loop over each block-row, and do the factorization */
1289: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290: nodesz = ns[inod + 1] - ns[inod];
1292: switch (nodesz) {
1293: case 1:
1294: /* zero rtmp1 */
1295: /* L part */
1296: nz = bi[i + 1] - bi[i];
1297: bjtmp = bj + bi[i];
1298: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1300: /* U part */
1301: nz = bdiag[i] - bdiag[i + 1];
1302: bjtmp = bj + bdiag[i + 1] + 1;
1303: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1305: /* load in initial (unfactored row) */
1306: nz = ai[r[i] + 1] - ai[r[i]];
1307: ajtmp = aj + ai[r[i]];
1308: v = aa + ai[r[i]];
1309: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1311: /* ZeropivotApply() */
1312: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1314: /* elimination */
1315: bjtmp = bj + bi[i];
1316: row = *bjtmp++;
1317: nzL = bi[i + 1] - bi[i];
1318: for (k = 0; k < nzL; k++) {
1319: pc = rtmp1 + row;
1320: if (*pc != 0.0) {
1321: pv = b->a + bdiag[row];
1322: mul1 = *pc * (*pv);
1323: *pc = mul1;
1324: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325: pv = b->a + bdiag[row + 1] + 1;
1326: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329: }
1330: row = *bjtmp++;
1331: }
1333: /* finished row so stick it into b->a */
1334: rs = 0.0;
1335: /* L part */
1336: pv = b->a + bi[i];
1337: pj = b->j + bi[i];
1338: nz = bi[i + 1] - bi[i];
1339: for (j = 0; j < nz; j++) {
1340: pv[j] = rtmp1[pj[j]];
1341: rs += PetscAbsScalar(pv[j]);
1342: }
1344: /* U part */
1345: pv = b->a + bdiag[i + 1] + 1;
1346: pj = b->j + bdiag[i + 1] + 1;
1347: nz = bdiag[i] - bdiag[i + 1] - 1;
1348: for (j = 0; j < nz; j++) {
1349: pv[j] = rtmp1[pj[j]];
1350: rs += PetscAbsScalar(pv[j]);
1351: }
1353: /* Check zero pivot */
1354: sctx.rs = rs;
1355: sctx.pv = rtmp1[i];
1356: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357: if (sctx.newshift) break;
1359: /* Mark diagonal and invert diagonal for simpler triangular solves */
1360: pv = b->a + bdiag[i];
1361: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362: break;
1364: case 2:
1365: /* zero rtmp1 and rtmp2 */
1366: /* L part */
1367: nz = bi[i + 1] - bi[i];
1368: bjtmp = bj + bi[i];
1369: for (j = 0; j < nz; j++) {
1370: col = bjtmp[j];
1371: rtmp1[col] = 0.0;
1372: rtmp2[col] = 0.0;
1373: }
1375: /* U part */
1376: nz = bdiag[i] - bdiag[i + 1];
1377: bjtmp = bj + bdiag[i + 1] + 1;
1378: for (j = 0; j < nz; j++) {
1379: col = bjtmp[j];
1380: rtmp1[col] = 0.0;
1381: rtmp2[col] = 0.0;
1382: }
1384: /* load in initial (unfactored row) */
1385: nz = ai[r[i] + 1] - ai[r[i]];
1386: ajtmp = aj + ai[r[i]];
1387: v1 = aa + ai[r[i]];
1388: v2 = aa + ai[r[i + 1]];
1389: for (j = 0; j < nz; j++) {
1390: col = ics[ajtmp[j]];
1391: rtmp1[col] = v1[j];
1392: rtmp2[col] = v2[j];
1393: }
1394: /* ZeropivotApply(): shift the diagonal of the matrix */
1395: rtmp1[i] += sctx.shift_amount;
1396: rtmp2[i + 1] += sctx.shift_amount;
1398: /* elimination */
1399: bjtmp = bj + bi[i];
1400: row = *bjtmp++; /* pivot row */
1401: nzL = bi[i + 1] - bi[i];
1402: for (k = 0; k < nzL; k++) {
1403: pc1 = rtmp1 + row;
1404: pc2 = rtmp2 + row;
1405: if (*pc1 != 0.0 || *pc2 != 0.0) {
1406: pv = b->a + bdiag[row];
1407: mul1 = *pc1 * (*pv);
1408: mul2 = *pc2 * (*pv);
1409: *pc1 = mul1;
1410: *pc2 = mul2;
1412: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413: pv = b->a + bdiag[row + 1] + 1;
1414: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415: for (j = 0; j < nz; j++) {
1416: col = pj[j];
1417: rtmp1[col] -= mul1 * pv[j];
1418: rtmp2[col] -= mul2 * pv[j];
1419: }
1420: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421: }
1422: row = *bjtmp++;
1423: }
1425: /* finished row i; check zero pivot, then stick row i into b->a */
1426: rs = 0.0;
1427: /* L part */
1428: pc1 = b->a + bi[i];
1429: pj = b->j + bi[i];
1430: nz = bi[i + 1] - bi[i];
1431: for (j = 0; j < nz; j++) {
1432: col = pj[j];
1433: pc1[j] = rtmp1[col];
1434: rs += PetscAbsScalar(pc1[j]);
1435: }
1436: /* U part */
1437: pc1 = b->a + bdiag[i + 1] + 1;
1438: pj = b->j + bdiag[i + 1] + 1;
1439: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440: for (j = 0; j < nz; j++) {
1441: col = pj[j];
1442: pc1[j] = rtmp1[col];
1443: rs += PetscAbsScalar(pc1[j]);
1444: }
1446: sctx.rs = rs;
1447: sctx.pv = rtmp1[i];
1448: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449: if (sctx.newshift) break;
1450: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451: *pc1 = 1.0 / sctx.pv;
1453: /* Now take care of diagonal 2x2 block. */
1454: pc2 = rtmp2 + i;
1455: if (*pc2 != 0.0) {
1456: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457: *pc2 = mul1; /* insert L entry */
1458: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460: for (j = 0; j < nz; j++) {
1461: col = pj[j];
1462: rtmp2[col] -= mul1 * rtmp1[col];
1463: }
1464: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465: }
1467: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468: rs = 0.0;
1469: /* L part */
1470: pc2 = b->a + bi[i + 1];
1471: pj = b->j + bi[i + 1];
1472: nz = bi[i + 2] - bi[i + 1];
1473: for (j = 0; j < nz; j++) {
1474: col = pj[j];
1475: pc2[j] = rtmp2[col];
1476: rs += PetscAbsScalar(pc2[j]);
1477: }
1478: /* U part */
1479: pc2 = b->a + bdiag[i + 2] + 1;
1480: pj = b->j + bdiag[i + 2] + 1;
1481: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482: for (j = 0; j < nz; j++) {
1483: col = pj[j];
1484: pc2[j] = rtmp2[col];
1485: rs += PetscAbsScalar(pc2[j]);
1486: }
1488: sctx.rs = rs;
1489: sctx.pv = rtmp2[i + 1];
1490: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491: if (sctx.newshift) break;
1492: pc2 = b->a + bdiag[i + 1];
1493: *pc2 = 1.0 / sctx.pv;
1494: break;
1496: case 3:
1497: /* zero rtmp */
1498: /* L part */
1499: nz = bi[i + 1] - bi[i];
1500: bjtmp = bj + bi[i];
1501: for (j = 0; j < nz; j++) {
1502: col = bjtmp[j];
1503: rtmp1[col] = 0.0;
1504: rtmp2[col] = 0.0;
1505: rtmp3[col] = 0.0;
1506: }
1508: /* U part */
1509: nz = bdiag[i] - bdiag[i + 1];
1510: bjtmp = bj + bdiag[i + 1] + 1;
1511: for (j = 0; j < nz; j++) {
1512: col = bjtmp[j];
1513: rtmp1[col] = 0.0;
1514: rtmp2[col] = 0.0;
1515: rtmp3[col] = 0.0;
1516: }
1518: /* load in initial (unfactored row) */
1519: nz = ai[r[i] + 1] - ai[r[i]];
1520: ajtmp = aj + ai[r[i]];
1521: v1 = aa + ai[r[i]];
1522: v2 = aa + ai[r[i + 1]];
1523: v3 = aa + ai[r[i + 2]];
1524: for (j = 0; j < nz; j++) {
1525: col = ics[ajtmp[j]];
1526: rtmp1[col] = v1[j];
1527: rtmp2[col] = v2[j];
1528: rtmp3[col] = v3[j];
1529: }
1530: /* ZeropivotApply(): shift the diagonal of the matrix */
1531: rtmp1[i] += sctx.shift_amount;
1532: rtmp2[i + 1] += sctx.shift_amount;
1533: rtmp3[i + 2] += sctx.shift_amount;
1535: /* elimination */
1536: bjtmp = bj + bi[i];
1537: row = *bjtmp++; /* pivot row */
1538: nzL = bi[i + 1] - bi[i];
1539: for (k = 0; k < nzL; k++) {
1540: pc1 = rtmp1 + row;
1541: pc2 = rtmp2 + row;
1542: pc3 = rtmp3 + row;
1543: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544: pv = b->a + bdiag[row];
1545: mul1 = *pc1 * (*pv);
1546: mul2 = *pc2 * (*pv);
1547: mul3 = *pc3 * (*pv);
1548: *pc1 = mul1;
1549: *pc2 = mul2;
1550: *pc3 = mul3;
1552: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553: pv = b->a + bdiag[row + 1] + 1;
1554: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555: for (j = 0; j < nz; j++) {
1556: col = pj[j];
1557: rtmp1[col] -= mul1 * pv[j];
1558: rtmp2[col] -= mul2 * pv[j];
1559: rtmp3[col] -= mul3 * pv[j];
1560: }
1561: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562: }
1563: row = *bjtmp++;
1564: }
1566: /* finished row i; check zero pivot, then stick row i into b->a */
1567: rs = 0.0;
1568: /* L part */
1569: pc1 = b->a + bi[i];
1570: pj = b->j + bi[i];
1571: nz = bi[i + 1] - bi[i];
1572: for (j = 0; j < nz; j++) {
1573: col = pj[j];
1574: pc1[j] = rtmp1[col];
1575: rs += PetscAbsScalar(pc1[j]);
1576: }
1577: /* U part */
1578: pc1 = b->a + bdiag[i + 1] + 1;
1579: pj = b->j + bdiag[i + 1] + 1;
1580: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581: for (j = 0; j < nz; j++) {
1582: col = pj[j];
1583: pc1[j] = rtmp1[col];
1584: rs += PetscAbsScalar(pc1[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp1[i];
1589: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590: if (sctx.newshift) break;
1591: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592: *pc1 = 1.0 / sctx.pv;
1594: /* Now take care of 1st column of diagonal 3x3 block. */
1595: pc2 = rtmp2 + i;
1596: pc3 = rtmp3 + i;
1597: if (*pc2 != 0.0 || *pc3 != 0.0) {
1598: mul2 = (*pc2) * (*pc1);
1599: *pc2 = mul2;
1600: mul3 = (*pc3) * (*pc1);
1601: *pc3 = mul3;
1602: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1603: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604: for (j = 0; j < nz; j++) {
1605: col = pj[j];
1606: rtmp2[col] -= mul2 * rtmp1[col];
1607: rtmp3[col] -= mul3 * rtmp1[col];
1608: }
1609: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610: }
1612: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613: rs = 0.0;
1614: /* L part */
1615: pc2 = b->a + bi[i + 1];
1616: pj = b->j + bi[i + 1];
1617: nz = bi[i + 2] - bi[i + 1];
1618: for (j = 0; j < nz; j++) {
1619: col = pj[j];
1620: pc2[j] = rtmp2[col];
1621: rs += PetscAbsScalar(pc2[j]);
1622: }
1623: /* U part */
1624: pc2 = b->a + bdiag[i + 2] + 1;
1625: pj = b->j + bdiag[i + 2] + 1;
1626: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627: for (j = 0; j < nz; j++) {
1628: col = pj[j];
1629: pc2[j] = rtmp2[col];
1630: rs += PetscAbsScalar(pc2[j]);
1631: }
1633: sctx.rs = rs;
1634: sctx.pv = rtmp2[i + 1];
1635: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636: if (sctx.newshift) break;
1637: pc2 = b->a + bdiag[i + 1];
1638: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1640: /* Now take care of 2nd column of diagonal 3x3 block. */
1641: pc3 = rtmp3 + i + 1;
1642: if (*pc3 != 0.0) {
1643: mul3 = (*pc3) * (*pc2);
1644: *pc3 = mul3;
1645: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1646: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647: for (j = 0; j < nz; j++) {
1648: col = pj[j];
1649: rtmp3[col] -= mul3 * rtmp2[col];
1650: }
1651: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652: }
1654: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655: rs = 0.0;
1656: /* L part */
1657: pc3 = b->a + bi[i + 2];
1658: pj = b->j + bi[i + 2];
1659: nz = bi[i + 3] - bi[i + 2];
1660: for (j = 0; j < nz; j++) {
1661: col = pj[j];
1662: pc3[j] = rtmp3[col];
1663: rs += PetscAbsScalar(pc3[j]);
1664: }
1665: /* U part */
1666: pc3 = b->a + bdiag[i + 3] + 1;
1667: pj = b->j + bdiag[i + 3] + 1;
1668: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669: for (j = 0; j < nz; j++) {
1670: col = pj[j];
1671: pc3[j] = rtmp3[col];
1672: rs += PetscAbsScalar(pc3[j]);
1673: }
1675: sctx.rs = rs;
1676: sctx.pv = rtmp3[i + 2];
1677: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678: if (sctx.newshift) break;
1679: pc3 = b->a + bdiag[i + 2];
1680: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681: break;
1682: case 4:
1683: /* zero rtmp */
1684: /* L part */
1685: nz = bi[i + 1] - bi[i];
1686: bjtmp = bj + bi[i];
1687: for (j = 0; j < nz; j++) {
1688: col = bjtmp[j];
1689: rtmp1[col] = 0.0;
1690: rtmp2[col] = 0.0;
1691: rtmp3[col] = 0.0;
1692: rtmp4[col] = 0.0;
1693: }
1695: /* U part */
1696: nz = bdiag[i] - bdiag[i + 1];
1697: bjtmp = bj + bdiag[i + 1] + 1;
1698: for (j = 0; j < nz; j++) {
1699: col = bjtmp[j];
1700: rtmp1[col] = 0.0;
1701: rtmp2[col] = 0.0;
1702: rtmp3[col] = 0.0;
1703: rtmp4[col] = 0.0;
1704: }
1706: /* load in initial (unfactored row) */
1707: nz = ai[r[i] + 1] - ai[r[i]];
1708: ajtmp = aj + ai[r[i]];
1709: v1 = aa + ai[r[i]];
1710: v2 = aa + ai[r[i + 1]];
1711: v3 = aa + ai[r[i + 2]];
1712: v4 = aa + ai[r[i + 3]];
1713: for (j = 0; j < nz; j++) {
1714: col = ics[ajtmp[j]];
1715: rtmp1[col] = v1[j];
1716: rtmp2[col] = v2[j];
1717: rtmp3[col] = v3[j];
1718: rtmp4[col] = v4[j];
1719: }
1720: /* ZeropivotApply(): shift the diagonal of the matrix */
1721: rtmp1[i] += sctx.shift_amount;
1722: rtmp2[i + 1] += sctx.shift_amount;
1723: rtmp3[i + 2] += sctx.shift_amount;
1724: rtmp4[i + 3] += sctx.shift_amount;
1726: /* elimination */
1727: bjtmp = bj + bi[i];
1728: row = *bjtmp++; /* pivot row */
1729: nzL = bi[i + 1] - bi[i];
1730: for (k = 0; k < nzL; k++) {
1731: pc1 = rtmp1 + row;
1732: pc2 = rtmp2 + row;
1733: pc3 = rtmp3 + row;
1734: pc4 = rtmp4 + row;
1735: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736: pv = b->a + bdiag[row];
1737: mul1 = *pc1 * (*pv);
1738: mul2 = *pc2 * (*pv);
1739: mul3 = *pc3 * (*pv);
1740: mul4 = *pc4 * (*pv);
1741: *pc1 = mul1;
1742: *pc2 = mul2;
1743: *pc3 = mul3;
1744: *pc4 = mul4;
1746: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747: pv = b->a + bdiag[row + 1] + 1;
1748: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749: for (j = 0; j < nz; j++) {
1750: col = pj[j];
1751: rtmp1[col] -= mul1 * pv[j];
1752: rtmp2[col] -= mul2 * pv[j];
1753: rtmp3[col] -= mul3 * pv[j];
1754: rtmp4[col] -= mul4 * pv[j];
1755: }
1756: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757: }
1758: row = *bjtmp++;
1759: }
1761: /* finished row i; check zero pivot, then stick row i into b->a */
1762: rs = 0.0;
1763: /* L part */
1764: pc1 = b->a + bi[i];
1765: pj = b->j + bi[i];
1766: nz = bi[i + 1] - bi[i];
1767: for (j = 0; j < nz; j++) {
1768: col = pj[j];
1769: pc1[j] = rtmp1[col];
1770: rs += PetscAbsScalar(pc1[j]);
1771: }
1772: /* U part */
1773: pc1 = b->a + bdiag[i + 1] + 1;
1774: pj = b->j + bdiag[i + 1] + 1;
1775: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776: for (j = 0; j < nz; j++) {
1777: col = pj[j];
1778: pc1[j] = rtmp1[col];
1779: rs += PetscAbsScalar(pc1[j]);
1780: }
1782: sctx.rs = rs;
1783: sctx.pv = rtmp1[i];
1784: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785: if (sctx.newshift) break;
1786: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1787: *pc1 = 1.0 / sctx.pv;
1789: /* Now take care of 1st column of diagonal 4x4 block. */
1790: pc2 = rtmp2 + i;
1791: pc3 = rtmp3 + i;
1792: pc4 = rtmp4 + i;
1793: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794: mul2 = (*pc2) * (*pc1);
1795: *pc2 = mul2;
1796: mul3 = (*pc3) * (*pc1);
1797: *pc3 = mul3;
1798: mul4 = (*pc4) * (*pc1);
1799: *pc4 = mul4;
1800: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1801: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: rtmp2[col] -= mul2 * rtmp1[col];
1805: rtmp3[col] -= mul3 * rtmp1[col];
1806: rtmp4[col] -= mul4 * rtmp1[col];
1807: }
1808: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809: }
1811: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812: rs = 0.0;
1813: /* L part */
1814: pc2 = b->a + bi[i + 1];
1815: pj = b->j + bi[i + 1];
1816: nz = bi[i + 2] - bi[i + 1];
1817: for (j = 0; j < nz; j++) {
1818: col = pj[j];
1819: pc2[j] = rtmp2[col];
1820: rs += PetscAbsScalar(pc2[j]);
1821: }
1822: /* U part */
1823: pc2 = b->a + bdiag[i + 2] + 1;
1824: pj = b->j + bdiag[i + 2] + 1;
1825: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826: for (j = 0; j < nz; j++) {
1827: col = pj[j];
1828: pc2[j] = rtmp2[col];
1829: rs += PetscAbsScalar(pc2[j]);
1830: }
1832: sctx.rs = rs;
1833: sctx.pv = rtmp2[i + 1];
1834: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835: if (sctx.newshift) break;
1836: pc2 = b->a + bdiag[i + 1];
1837: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1839: /* Now take care of 2nd column of diagonal 4x4 block. */
1840: pc3 = rtmp3 + i + 1;
1841: pc4 = rtmp4 + i + 1;
1842: if (*pc3 != 0.0 || *pc4 != 0.0) {
1843: mul3 = (*pc3) * (*pc2);
1844: *pc3 = mul3;
1845: mul4 = (*pc4) * (*pc2);
1846: *pc4 = mul4;
1847: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1848: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849: for (j = 0; j < nz; j++) {
1850: col = pj[j];
1851: rtmp3[col] -= mul3 * rtmp2[col];
1852: rtmp4[col] -= mul4 * rtmp2[col];
1853: }
1854: PetscCall(PetscLogFlops(4.0 * nz));
1855: }
1857: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858: rs = 0.0;
1859: /* L part */
1860: pc3 = b->a + bi[i + 2];
1861: pj = b->j + bi[i + 2];
1862: nz = bi[i + 3] - bi[i + 2];
1863: for (j = 0; j < nz; j++) {
1864: col = pj[j];
1865: pc3[j] = rtmp3[col];
1866: rs += PetscAbsScalar(pc3[j]);
1867: }
1868: /* U part */
1869: pc3 = b->a + bdiag[i + 3] + 1;
1870: pj = b->j + bdiag[i + 3] + 1;
1871: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872: for (j = 0; j < nz; j++) {
1873: col = pj[j];
1874: pc3[j] = rtmp3[col];
1875: rs += PetscAbsScalar(pc3[j]);
1876: }
1878: sctx.rs = rs;
1879: sctx.pv = rtmp3[i + 2];
1880: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881: if (sctx.newshift) break;
1882: pc3 = b->a + bdiag[i + 2];
1883: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1885: /* Now take care of 3rd column of diagonal 4x4 block. */
1886: pc4 = rtmp4 + i + 2;
1887: if (*pc4 != 0.0) {
1888: mul4 = (*pc4) * (*pc3);
1889: *pc4 = mul4;
1890: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1891: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892: for (j = 0; j < nz; j++) {
1893: col = pj[j];
1894: rtmp4[col] -= mul4 * rtmp3[col];
1895: }
1896: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897: }
1899: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900: rs = 0.0;
1901: /* L part */
1902: pc4 = b->a + bi[i + 3];
1903: pj = b->j + bi[i + 3];
1904: nz = bi[i + 4] - bi[i + 3];
1905: for (j = 0; j < nz; j++) {
1906: col = pj[j];
1907: pc4[j] = rtmp4[col];
1908: rs += PetscAbsScalar(pc4[j]);
1909: }
1910: /* U part */
1911: pc4 = b->a + bdiag[i + 4] + 1;
1912: pj = b->j + bdiag[i + 4] + 1;
1913: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914: for (j = 0; j < nz; j++) {
1915: col = pj[j];
1916: pc4[j] = rtmp4[col];
1917: rs += PetscAbsScalar(pc4[j]);
1918: }
1920: sctx.rs = rs;
1921: sctx.pv = rtmp4[i + 3];
1922: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923: if (sctx.newshift) break;
1924: pc4 = b->a + bdiag[i + 3];
1925: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926: break;
1928: default:
1929: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930: }
1931: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932: i += nodesz; /* Update the row */
1933: }
1935: /* MatPivotRefine() */
1936: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937: /*
1938: * if no shift in this attempt & shifting & started shifting & can refine,
1939: * then try lower shift
1940: */
1941: sctx.shift_hi = sctx.shift_fraction;
1942: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1944: sctx.newshift = PETSC_TRUE;
1945: sctx.nshift++;
1946: }
1947: } while (sctx.newshift);
1949: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950: PetscCall(PetscFree(tmp_vec2));
1951: PetscCall(ISRestoreIndices(isicol, &ic));
1952: PetscCall(ISRestoreIndices(isrow, &r));
1954: if (b->inode.size_csr) {
1955: C->ops->solve = MatSolve_SeqAIJ_Inode;
1956: } else {
1957: C->ops->solve = MatSolve_SeqAIJ;
1958: }
1959: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1960: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1961: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962: C->ops->matsolve = MatMatSolve_SeqAIJ;
1963: C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964: C->assembled = PETSC_TRUE;
1965: C->preallocated = PETSC_TRUE;
1967: PetscCall(PetscLogFlops(C->cmap->n));
1969: /* MatShiftView(A,info,&sctx) */
1970: if (sctx.nshift) {
1971: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977: }
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
1983: {
1984: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1985: IS iscol = a->col, isrow = a->row;
1986: const PetscInt *r, *c, *rout, *cout;
1987: PetscInt i, j;
1988: PetscInt node_max, row, nsz, aii, i0, i1, nz;
1989: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
1990: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
1991: PetscScalar sum1, sum2, sum3, sum4, sum5;
1992: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
1993: const PetscScalar *b;
1995: PetscFunctionBegin;
1996: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1997: node_max = a->inode.node_count;
1998: ns = a->inode.size_csr; /* Node Size array */
2000: PetscCall(VecGetArrayRead(bb, &b));
2001: PetscCall(VecGetArrayWrite(xx, &x));
2002: tmp = a->solve_work;
2004: PetscCall(ISGetIndices(isrow, &rout));
2005: r = rout;
2006: PetscCall(ISGetIndices(iscol, &cout));
2007: c = cout;
2009: /* forward solve the lower triangular */
2010: tmps = tmp;
2011: aa = a_a;
2012: aj = a_j;
2013: ad = a->diag;
2015: for (i = 0; i < node_max; ++i) {
2016: row = ns[i];
2017: nsz = ns[i + 1] - ns[i];
2018: aii = ai[row];
2019: v1 = aa + aii;
2020: vi = aj + aii;
2021: nz = ai[row + 1] - ai[row];
2023: if (i < node_max - 1) {
2024: /* Prefetch the indices for the next block */
2025: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2026: /* Prefetch the data for the next block */
2027: PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2028: }
2030: switch (nsz) { /* Each loop in 'case' is unrolled */
2031: case 1:
2032: sum1 = b[r[row]];
2033: for (j = 0; j < nz - 1; j += 2) {
2034: i0 = vi[j];
2035: i1 = vi[j + 1];
2036: tmp0 = tmps[i0];
2037: tmp1 = tmps[i1];
2038: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2039: }
2040: if (j == nz - 1) {
2041: tmp0 = tmps[vi[j]];
2042: sum1 -= v1[j] * tmp0;
2043: }
2044: tmp[row++] = sum1;
2045: break;
2046: case 2:
2047: sum1 = b[r[row]];
2048: sum2 = b[r[row + 1]];
2049: v2 = aa + ai[row + 1];
2051: for (j = 0; j < nz - 1; j += 2) {
2052: i0 = vi[j];
2053: i1 = vi[j + 1];
2054: tmp0 = tmps[i0];
2055: tmp1 = tmps[i1];
2056: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2057: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2058: }
2059: if (j == nz - 1) {
2060: tmp0 = tmps[vi[j]];
2061: sum1 -= v1[j] * tmp0;
2062: sum2 -= v2[j] * tmp0;
2063: }
2064: sum2 -= v2[nz] * sum1;
2065: tmp[row++] = sum1;
2066: tmp[row++] = sum2;
2067: break;
2068: case 3:
2069: sum1 = b[r[row]];
2070: sum2 = b[r[row + 1]];
2071: sum3 = b[r[row + 2]];
2072: v2 = aa + ai[row + 1];
2073: v3 = aa + ai[row + 2];
2075: for (j = 0; j < nz - 1; j += 2) {
2076: i0 = vi[j];
2077: i1 = vi[j + 1];
2078: tmp0 = tmps[i0];
2079: tmp1 = tmps[i1];
2080: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2081: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2082: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2083: }
2084: if (j == nz - 1) {
2085: tmp0 = tmps[vi[j]];
2086: sum1 -= v1[j] * tmp0;
2087: sum2 -= v2[j] * tmp0;
2088: sum3 -= v3[j] * tmp0;
2089: }
2090: sum2 -= v2[nz] * sum1;
2091: sum3 -= v3[nz] * sum1;
2092: sum3 -= v3[nz + 1] * sum2;
2093: tmp[row++] = sum1;
2094: tmp[row++] = sum2;
2095: tmp[row++] = sum3;
2096: break;
2098: case 4:
2099: sum1 = b[r[row]];
2100: sum2 = b[r[row + 1]];
2101: sum3 = b[r[row + 2]];
2102: sum4 = b[r[row + 3]];
2103: v2 = aa + ai[row + 1];
2104: v3 = aa + ai[row + 2];
2105: v4 = aa + ai[row + 3];
2107: for (j = 0; j < nz - 1; j += 2) {
2108: i0 = vi[j];
2109: i1 = vi[j + 1];
2110: tmp0 = tmps[i0];
2111: tmp1 = tmps[i1];
2112: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2113: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2114: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2115: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2116: }
2117: if (j == nz - 1) {
2118: tmp0 = tmps[vi[j]];
2119: sum1 -= v1[j] * tmp0;
2120: sum2 -= v2[j] * tmp0;
2121: sum3 -= v3[j] * tmp0;
2122: sum4 -= v4[j] * tmp0;
2123: }
2124: sum2 -= v2[nz] * sum1;
2125: sum3 -= v3[nz] * sum1;
2126: sum4 -= v4[nz] * sum1;
2127: sum3 -= v3[nz + 1] * sum2;
2128: sum4 -= v4[nz + 1] * sum2;
2129: sum4 -= v4[nz + 2] * sum3;
2131: tmp[row++] = sum1;
2132: tmp[row++] = sum2;
2133: tmp[row++] = sum3;
2134: tmp[row++] = sum4;
2135: break;
2136: case 5:
2137: sum1 = b[r[row]];
2138: sum2 = b[r[row + 1]];
2139: sum3 = b[r[row + 2]];
2140: sum4 = b[r[row + 3]];
2141: sum5 = b[r[row + 4]];
2142: v2 = aa + ai[row + 1];
2143: v3 = aa + ai[row + 2];
2144: v4 = aa + ai[row + 3];
2145: v5 = aa + ai[row + 4];
2147: for (j = 0; j < nz - 1; j += 2) {
2148: i0 = vi[j];
2149: i1 = vi[j + 1];
2150: tmp0 = tmps[i0];
2151: tmp1 = tmps[i1];
2152: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2153: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2154: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2155: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2156: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2157: }
2158: if (j == nz - 1) {
2159: tmp0 = tmps[vi[j]];
2160: sum1 -= v1[j] * tmp0;
2161: sum2 -= v2[j] * tmp0;
2162: sum3 -= v3[j] * tmp0;
2163: sum4 -= v4[j] * tmp0;
2164: sum5 -= v5[j] * tmp0;
2165: }
2167: sum2 -= v2[nz] * sum1;
2168: sum3 -= v3[nz] * sum1;
2169: sum4 -= v4[nz] * sum1;
2170: sum5 -= v5[nz] * sum1;
2171: sum3 -= v3[nz + 1] * sum2;
2172: sum4 -= v4[nz + 1] * sum2;
2173: sum5 -= v5[nz + 1] * sum2;
2174: sum4 -= v4[nz + 2] * sum3;
2175: sum5 -= v5[nz + 2] * sum3;
2176: sum5 -= v5[nz + 3] * sum4;
2178: tmp[row++] = sum1;
2179: tmp[row++] = sum2;
2180: tmp[row++] = sum3;
2181: tmp[row++] = sum4;
2182: tmp[row++] = sum5;
2183: break;
2184: default:
2185: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2186: }
2187: }
2188: /* backward solve the upper triangular */
2189: for (i = node_max - 1; i >= 0; i--) {
2190: row = ns[i + 1] - 1;
2191: nsz = ns[i + 1] - ns[i];
2192: aii = ad[row + 1] + 1;
2193: v1 = aa + aii;
2194: vi = aj + aii;
2195: nz = ad[row] - ad[row + 1] - 1;
2197: if (i > 0) {
2198: /* Prefetch the indices for the next block */
2199: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2200: /* Prefetch the data for the next block */
2201: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2202: }
2204: switch (nsz) { /* Each loop in 'case' is unrolled */
2205: case 1:
2206: sum1 = tmp[row];
2208: for (j = 0; j < nz - 1; j += 2) {
2209: i0 = vi[j];
2210: i1 = vi[j + 1];
2211: tmp0 = tmps[i0];
2212: tmp1 = tmps[i1];
2213: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2214: }
2215: if (j == nz - 1) {
2216: tmp0 = tmps[vi[j]];
2217: sum1 -= v1[j] * tmp0;
2218: }
2219: x[c[row]] = tmp[row] = sum1 * v1[nz];
2220: row--;
2221: break;
2222: case 2:
2223: sum1 = tmp[row];
2224: sum2 = tmp[row - 1];
2225: v2 = aa + ad[row] + 1;
2226: for (j = 0; j < nz - 1; j += 2) {
2227: i0 = vi[j];
2228: i1 = vi[j + 1];
2229: tmp0 = tmps[i0];
2230: tmp1 = tmps[i1];
2231: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2232: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2233: }
2234: if (j == nz - 1) {
2235: tmp0 = tmps[vi[j]];
2236: sum1 -= v1[j] * tmp0;
2237: sum2 -= v2[j + 1] * tmp0;
2238: }
2240: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2241: row--;
2242: sum2 -= v2[0] * tmp0;
2243: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2244: row--;
2245: break;
2246: case 3:
2247: sum1 = tmp[row];
2248: sum2 = tmp[row - 1];
2249: sum3 = tmp[row - 2];
2250: v2 = aa + ad[row] + 1;
2251: v3 = aa + ad[row - 1] + 1;
2252: for (j = 0; j < nz - 1; j += 2) {
2253: i0 = vi[j];
2254: i1 = vi[j + 1];
2255: tmp0 = tmps[i0];
2256: tmp1 = tmps[i1];
2257: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2258: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2259: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2260: }
2261: if (j == nz - 1) {
2262: tmp0 = tmps[vi[j]];
2263: sum1 -= v1[j] * tmp0;
2264: sum2 -= v2[j + 1] * tmp0;
2265: sum3 -= v3[j + 2] * tmp0;
2266: }
2267: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2268: row--;
2269: sum2 -= v2[0] * tmp0;
2270: sum3 -= v3[1] * tmp0;
2271: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2272: row--;
2273: sum3 -= v3[0] * tmp0;
2274: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2275: row--;
2277: break;
2278: case 4:
2279: sum1 = tmp[row];
2280: sum2 = tmp[row - 1];
2281: sum3 = tmp[row - 2];
2282: sum4 = tmp[row - 3];
2283: v2 = aa + ad[row] + 1;
2284: v3 = aa + ad[row - 1] + 1;
2285: v4 = aa + ad[row - 2] + 1;
2287: for (j = 0; j < nz - 1; j += 2) {
2288: i0 = vi[j];
2289: i1 = vi[j + 1];
2290: tmp0 = tmps[i0];
2291: tmp1 = tmps[i1];
2292: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2293: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2294: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2295: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2296: }
2297: if (j == nz - 1) {
2298: tmp0 = tmps[vi[j]];
2299: sum1 -= v1[j] * tmp0;
2300: sum2 -= v2[j + 1] * tmp0;
2301: sum3 -= v3[j + 2] * tmp0;
2302: sum4 -= v4[j + 3] * tmp0;
2303: }
2305: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2306: row--;
2307: sum2 -= v2[0] * tmp0;
2308: sum3 -= v3[1] * tmp0;
2309: sum4 -= v4[2] * tmp0;
2310: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2311: row--;
2312: sum3 -= v3[0] * tmp0;
2313: sum4 -= v4[1] * tmp0;
2314: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2315: row--;
2316: sum4 -= v4[0] * tmp0;
2317: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2318: row--;
2319: break;
2320: case 5:
2321: sum1 = tmp[row];
2322: sum2 = tmp[row - 1];
2323: sum3 = tmp[row - 2];
2324: sum4 = tmp[row - 3];
2325: sum5 = tmp[row - 4];
2326: v2 = aa + ad[row] + 1;
2327: v3 = aa + ad[row - 1] + 1;
2328: v4 = aa + ad[row - 2] + 1;
2329: v5 = aa + ad[row - 3] + 1;
2330: for (j = 0; j < nz - 1; j += 2) {
2331: i0 = vi[j];
2332: i1 = vi[j + 1];
2333: tmp0 = tmps[i0];
2334: tmp1 = tmps[i1];
2335: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2336: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2337: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2338: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2339: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2340: }
2341: if (j == nz - 1) {
2342: tmp0 = tmps[vi[j]];
2343: sum1 -= v1[j] * tmp0;
2344: sum2 -= v2[j + 1] * tmp0;
2345: sum3 -= v3[j + 2] * tmp0;
2346: sum4 -= v4[j + 3] * tmp0;
2347: sum5 -= v5[j + 4] * tmp0;
2348: }
2350: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2351: row--;
2352: sum2 -= v2[0] * tmp0;
2353: sum3 -= v3[1] * tmp0;
2354: sum4 -= v4[2] * tmp0;
2355: sum5 -= v5[3] * tmp0;
2356: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2357: row--;
2358: sum3 -= v3[0] * tmp0;
2359: sum4 -= v4[1] * tmp0;
2360: sum5 -= v5[2] * tmp0;
2361: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2362: row--;
2363: sum4 -= v4[0] * tmp0;
2364: sum5 -= v5[1] * tmp0;
2365: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2366: row--;
2367: sum5 -= v5[0] * tmp0;
2368: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2369: row--;
2370: break;
2371: default:
2372: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2373: }
2374: }
2375: PetscCall(ISRestoreIndices(isrow, &rout));
2376: PetscCall(ISRestoreIndices(iscol, &cout));
2377: PetscCall(VecRestoreArrayRead(bb, &b));
2378: PetscCall(VecRestoreArrayWrite(xx, &x));
2379: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2380: PetscFunctionReturn(PETSC_SUCCESS);
2381: }
2383: /*
2384: Makes a longer coloring[] array and calls the usual code with that
2385: */
2386: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2387: {
2388: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2389: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2390: PetscInt *colorused, i;
2391: ISColoringValue *newcolor;
2393: PetscFunctionBegin;
2394: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2395: PetscCall(PetscMalloc1(n + 1, &newcolor));
2396: /* loop over inodes, marking a color for each column*/
2397: row = 0;
2398: for (i = 0; i < m; i++) {
2399: for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2400: }
2402: /* eliminate unneeded colors */
2403: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2404: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2406: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2407: ncolors = colorused[5 * ncolors - 1];
2408: for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2409: PetscCall(PetscFree(colorused));
2410: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2411: PetscCall(PetscFree(coloring));
2412: PetscFunctionReturn(PETSC_SUCCESS);
2413: }
2415: #include <petsc/private/kernels/blockinvert.h>
2417: /*
2418: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
2419: */
2420: static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2421: {
2422: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2423: MatScalar *ibdiag, *bdiag, work[25];
2424: const MatScalar *v = a->a;
2425: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2426: PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
2427: PetscInt k, ipvt[5];
2428: PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2429: const PetscInt *sizes = a->inode.size_csr, *diag;
2431: PetscFunctionBegin;
2432: if (a->inode.ibdiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2433: PetscCall(PetscInfo(A, "%s inode block-diagonal inverse for SOR\n", a->inode.ibdiag ? "Recomputing" : "Computing"));
2434: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL));
2435: if (!a->inode.ibdiag) {
2436: /* calculate space needed for diagonal blocks */
2437: for (i = 0; i < m; i++) {
2438: nodesz = sizes[i + 1] - sizes[i];
2439: cnt += nodesz * nodesz;
2440: }
2441: a->inode.bdiagsize = cnt;
2442: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2443: }
2445: /* copy over the diagonal blocks and invert them */
2446: ibdiag = a->inode.ibdiag;
2447: bdiag = a->inode.bdiag;
2448: cnt = 0;
2449: for (i = 0, row = 0; i < m; i++) {
2450: nodesz = sizes[i + 1] - sizes[i];
2451: for (j = 0; j < nodesz; j++) {
2452: for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2453: }
2454: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2456: switch (nodesz) {
2457: case 1:
2458: /* Create matrix data structure */
2459: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2460: PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2461: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2462: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2463: A->factorerror_zeropivot_row = row;
2464: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2465: }
2466: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2467: break;
2468: case 2:
2469: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2470: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2471: break;
2472: case 3:
2473: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2474: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2475: break;
2476: case 4:
2477: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2478: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2479: break;
2480: case 5:
2481: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2482: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2483: break;
2484: default:
2485: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2486: }
2487: cnt += nodesz * nodesz;
2488: row += nodesz;
2489: }
2490: a->inode.ibdiagState = ((PetscObject)A)->state;
2491: PetscFunctionReturn(PETSC_SUCCESS);
2492: }
2494: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2495: {
2496: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2497: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2498: MatScalar *ibdiag, *bdiag, *t;
2499: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2500: const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2501: const PetscScalar *xb, *b;
2502: PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
2503: PetscInt sz;
2504: const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
2506: PetscFunctionBegin;
2507: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2508: /* The PetscChecks below guarantee omega == 1.0 and fshift == 0.0 by the
2509: time MatInvertDiagonalForSOR_SeqAIJ_Inode() runs, so that helper's cache
2510: key omits omega and fshift. If these checks are ever relaxed to support
2511: omega != 1.0 or fshift != 0.0 on the inode path, the cache there must
2512: also be re-keyed on (state, omega, fshift) the way
2513: MatInvertDiagonalForSOR_SeqAIJ() in aij.c does. */
2514: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2515: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2516: PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
2517: diag = a->diag;
2519: ibdiag = a->inode.ibdiag;
2520: bdiag = a->inode.bdiag;
2521: t = a->inode.ssor_work;
2523: PetscCall(VecGetArray(xx, &x));
2524: PetscCall(VecGetArrayRead(bb, &b));
2525: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2526: if (flag & SOR_ZERO_INITIAL_GUESS) {
2527: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2528: for (i = 0, row = 0; i < m; i++) {
2529: sz = diag[row] - ii[row];
2530: v1 = a->a + ii[row];
2531: idx = a->j + ii[row];
2533: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2534: nodesz = sizes[i + 1] - sizes[i];
2535: switch (nodesz) {
2536: case 1:
2538: sum1 = b[row];
2539: for (n = 0; n < sz - 1; n += 2) {
2540: i1 = idx[0];
2541: i2 = idx[1];
2542: idx += 2;
2543: tmp0 = x[i1];
2544: tmp1 = x[i2];
2545: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2546: v1 += 2;
2547: }
2549: if (n == sz - 1) {
2550: tmp0 = x[*idx];
2551: sum1 -= *v1 * tmp0;
2552: }
2553: t[row] = sum1;
2554: x[row++] = sum1 * (*ibdiag++);
2555: break;
2556: case 2:
2557: v2 = a->a + ii[row + 1];
2558: sum1 = b[row];
2559: sum2 = b[row + 1];
2560: for (n = 0; n < sz - 1; n += 2) {
2561: i1 = idx[0];
2562: i2 = idx[1];
2563: idx += 2;
2564: tmp0 = x[i1];
2565: tmp1 = x[i2];
2566: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2567: v1 += 2;
2568: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2569: v2 += 2;
2570: }
2572: if (n == sz - 1) {
2573: tmp0 = x[*idx];
2574: sum1 -= v1[0] * tmp0;
2575: sum2 -= v2[0] * tmp0;
2576: }
2577: t[row] = sum1;
2578: t[row + 1] = sum2;
2579: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2580: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2581: ibdiag += 4;
2582: break;
2583: case 3:
2584: v2 = a->a + ii[row + 1];
2585: v3 = a->a + ii[row + 2];
2586: sum1 = b[row];
2587: sum2 = b[row + 1];
2588: sum3 = b[row + 2];
2589: for (n = 0; n < sz - 1; n += 2) {
2590: i1 = idx[0];
2591: i2 = idx[1];
2592: idx += 2;
2593: tmp0 = x[i1];
2594: tmp1 = x[i2];
2595: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2596: v1 += 2;
2597: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2598: v2 += 2;
2599: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2600: v3 += 2;
2601: }
2603: if (n == sz - 1) {
2604: tmp0 = x[*idx];
2605: sum1 -= v1[0] * tmp0;
2606: sum2 -= v2[0] * tmp0;
2607: sum3 -= v3[0] * tmp0;
2608: }
2609: t[row] = sum1;
2610: t[row + 1] = sum2;
2611: t[row + 2] = sum3;
2612: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2613: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2614: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2615: ibdiag += 9;
2616: break;
2617: case 4:
2618: v2 = a->a + ii[row + 1];
2619: v3 = a->a + ii[row + 2];
2620: v4 = a->a + ii[row + 3];
2621: sum1 = b[row];
2622: sum2 = b[row + 1];
2623: sum3 = b[row + 2];
2624: sum4 = b[row + 3];
2625: for (n = 0; n < sz - 1; n += 2) {
2626: i1 = idx[0];
2627: i2 = idx[1];
2628: idx += 2;
2629: tmp0 = x[i1];
2630: tmp1 = x[i2];
2631: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2632: v1 += 2;
2633: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2634: v2 += 2;
2635: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2636: v3 += 2;
2637: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2638: v4 += 2;
2639: }
2641: if (n == sz - 1) {
2642: tmp0 = x[*idx];
2643: sum1 -= v1[0] * tmp0;
2644: sum2 -= v2[0] * tmp0;
2645: sum3 -= v3[0] * tmp0;
2646: sum4 -= v4[0] * tmp0;
2647: }
2648: t[row] = sum1;
2649: t[row + 1] = sum2;
2650: t[row + 2] = sum3;
2651: t[row + 3] = sum4;
2652: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
2653: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
2654: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
2655: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
2656: ibdiag += 16;
2657: break;
2658: case 5:
2659: v2 = a->a + ii[row + 1];
2660: v3 = a->a + ii[row + 2];
2661: v4 = a->a + ii[row + 3];
2662: v5 = a->a + ii[row + 4];
2663: sum1 = b[row];
2664: sum2 = b[row + 1];
2665: sum3 = b[row + 2];
2666: sum4 = b[row + 3];
2667: sum5 = b[row + 4];
2668: for (n = 0; n < sz - 1; n += 2) {
2669: i1 = idx[0];
2670: i2 = idx[1];
2671: idx += 2;
2672: tmp0 = x[i1];
2673: tmp1 = x[i2];
2674: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2675: v1 += 2;
2676: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2677: v2 += 2;
2678: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2679: v3 += 2;
2680: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2681: v4 += 2;
2682: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2683: v5 += 2;
2684: }
2686: if (n == sz - 1) {
2687: tmp0 = x[*idx];
2688: sum1 -= v1[0] * tmp0;
2689: sum2 -= v2[0] * tmp0;
2690: sum3 -= v3[0] * tmp0;
2691: sum4 -= v4[0] * tmp0;
2692: sum5 -= v5[0] * tmp0;
2693: }
2694: t[row] = sum1;
2695: t[row + 1] = sum2;
2696: t[row + 2] = sum3;
2697: t[row + 3] = sum4;
2698: t[row + 4] = sum5;
2699: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
2700: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
2701: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
2702: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
2703: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
2704: ibdiag += 25;
2705: break;
2706: default:
2707: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2708: }
2709: }
2711: xb = t;
2712: PetscCall(PetscLogFlops(a->nz));
2713: } else xb = b;
2714: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2715: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
2716: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
2717: nodesz = sizes[i + 1] - sizes[i];
2718: ibdiag -= nodesz * nodesz;
2719: sz = ii[row + 1] - diag[row] - 1;
2720: v1 = a->a + diag[row] + 1;
2721: idx = a->j + diag[row] + 1;
2723: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2724: switch (nodesz) {
2725: case 1:
2727: sum1 = xb[row];
2728: for (n = 0; n < sz - 1; n += 2) {
2729: i1 = idx[0];
2730: i2 = idx[1];
2731: idx += 2;
2732: tmp0 = x[i1];
2733: tmp1 = x[i2];
2734: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2735: v1 += 2;
2736: }
2738: if (n == sz - 1) {
2739: tmp0 = x[*idx];
2740: sum1 -= *v1 * tmp0;
2741: }
2742: x[row--] = sum1 * (*ibdiag);
2743: break;
2745: case 2:
2747: sum1 = xb[row];
2748: sum2 = xb[row - 1];
2749: /* note that sum1 is associated with the second of the two rows */
2750: v2 = a->a + diag[row - 1] + 2;
2751: for (n = 0; n < sz - 1; n += 2) {
2752: i1 = idx[0];
2753: i2 = idx[1];
2754: idx += 2;
2755: tmp0 = x[i1];
2756: tmp1 = x[i2];
2757: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2758: v1 += 2;
2759: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2760: v2 += 2;
2761: }
2763: if (n == sz - 1) {
2764: tmp0 = x[*idx];
2765: sum1 -= *v1 * tmp0;
2766: sum2 -= *v2 * tmp0;
2767: }
2768: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
2769: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
2770: break;
2771: case 3:
2773: sum1 = xb[row];
2774: sum2 = xb[row - 1];
2775: sum3 = xb[row - 2];
2776: v2 = a->a + diag[row - 1] + 2;
2777: v3 = a->a + diag[row - 2] + 3;
2778: for (n = 0; n < sz - 1; n += 2) {
2779: i1 = idx[0];
2780: i2 = idx[1];
2781: idx += 2;
2782: tmp0 = x[i1];
2783: tmp1 = x[i2];
2784: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2785: v1 += 2;
2786: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2787: v2 += 2;
2788: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2789: v3 += 2;
2790: }
2792: if (n == sz - 1) {
2793: tmp0 = x[*idx];
2794: sum1 -= *v1 * tmp0;
2795: sum2 -= *v2 * tmp0;
2796: sum3 -= *v3 * tmp0;
2797: }
2798: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
2799: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
2800: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
2801: break;
2802: case 4:
2804: sum1 = xb[row];
2805: sum2 = xb[row - 1];
2806: sum3 = xb[row - 2];
2807: sum4 = xb[row - 3];
2808: v2 = a->a + diag[row - 1] + 2;
2809: v3 = a->a + diag[row - 2] + 3;
2810: v4 = a->a + diag[row - 3] + 4;
2811: for (n = 0; n < sz - 1; n += 2) {
2812: i1 = idx[0];
2813: i2 = idx[1];
2814: idx += 2;
2815: tmp0 = x[i1];
2816: tmp1 = x[i2];
2817: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2818: v1 += 2;
2819: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2820: v2 += 2;
2821: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2822: v3 += 2;
2823: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2824: v4 += 2;
2825: }
2827: if (n == sz - 1) {
2828: tmp0 = x[*idx];
2829: sum1 -= *v1 * tmp0;
2830: sum2 -= *v2 * tmp0;
2831: sum3 -= *v3 * tmp0;
2832: sum4 -= *v4 * tmp0;
2833: }
2834: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
2835: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
2836: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
2837: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
2838: break;
2839: case 5:
2841: sum1 = xb[row];
2842: sum2 = xb[row - 1];
2843: sum3 = xb[row - 2];
2844: sum4 = xb[row - 3];
2845: sum5 = xb[row - 4];
2846: v2 = a->a + diag[row - 1] + 2;
2847: v3 = a->a + diag[row - 2] + 3;
2848: v4 = a->a + diag[row - 3] + 4;
2849: v5 = a->a + diag[row - 4] + 5;
2850: for (n = 0; n < sz - 1; n += 2) {
2851: i1 = idx[0];
2852: i2 = idx[1];
2853: idx += 2;
2854: tmp0 = x[i1];
2855: tmp1 = x[i2];
2856: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2857: v1 += 2;
2858: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2859: v2 += 2;
2860: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2861: v3 += 2;
2862: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2863: v4 += 2;
2864: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2865: v5 += 2;
2866: }
2868: if (n == sz - 1) {
2869: tmp0 = x[*idx];
2870: sum1 -= *v1 * tmp0;
2871: sum2 -= *v2 * tmp0;
2872: sum3 -= *v3 * tmp0;
2873: sum4 -= *v4 * tmp0;
2874: sum5 -= *v5 * tmp0;
2875: }
2876: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
2877: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
2878: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
2879: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
2880: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
2881: break;
2882: default:
2883: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2884: }
2885: }
2887: PetscCall(PetscLogFlops(a->nz));
2888: }
2889: its--;
2890: }
2891: while (its--) {
2892: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2893: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
2894: nodesz = sizes[i + 1] - sizes[i];
2895: sz = diag[row] - ii[row];
2896: v1 = a->a + ii[row];
2897: idx = a->j + ii[row];
2898: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2899: switch (nodesz) {
2900: case 1:
2901: sum1 = b[row];
2902: for (n = 0; n < sz - 1; n += 2) {
2903: i1 = idx[0];
2904: i2 = idx[1];
2905: idx += 2;
2906: tmp0 = x[i1];
2907: tmp1 = x[i2];
2908: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2909: v1 += 2;
2910: }
2911: if (n == sz - 1) {
2912: tmp0 = x[*idx++];
2913: sum1 -= *v1 * tmp0;
2914: v1++;
2915: }
2916: t[row] = sum1;
2917: sz = ii[row + 1] - diag[row] - 1;
2918: idx = a->j + diag[row] + 1;
2919: v1 += 1;
2920: for (n = 0; n < sz - 1; n += 2) {
2921: i1 = idx[0];
2922: i2 = idx[1];
2923: idx += 2;
2924: tmp0 = x[i1];
2925: tmp1 = x[i2];
2926: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2927: v1 += 2;
2928: }
2929: if (n == sz - 1) {
2930: tmp0 = x[*idx++];
2931: sum1 -= *v1 * tmp0;
2932: }
2933: /* in MatSOR_SeqAIJ this line would be
2934: *
2935: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
2936: *
2937: * but omega == 1, so this becomes
2938: *
2939: * x[row] = sum1*(*ibdiag++);
2940: *
2941: */
2942: x[row] = sum1 * (*ibdiag);
2943: break;
2944: case 2:
2945: v2 = a->a + ii[row + 1];
2946: sum1 = b[row];
2947: sum2 = b[row + 1];
2948: for (n = 0; n < sz - 1; n += 2) {
2949: i1 = idx[0];
2950: i2 = idx[1];
2951: idx += 2;
2952: tmp0 = x[i1];
2953: tmp1 = x[i2];
2954: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2955: v1 += 2;
2956: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2957: v2 += 2;
2958: }
2959: if (n == sz - 1) {
2960: tmp0 = x[*idx++];
2961: sum1 -= v1[0] * tmp0;
2962: sum2 -= v2[0] * tmp0;
2963: v1++;
2964: v2++;
2965: }
2966: t[row] = sum1;
2967: t[row + 1] = sum2;
2968: sz = ii[row + 1] - diag[row] - 2;
2969: idx = a->j + diag[row] + 2;
2970: v1 += 2;
2971: v2 += 2;
2972: for (n = 0; n < sz - 1; n += 2) {
2973: i1 = idx[0];
2974: i2 = idx[1];
2975: idx += 2;
2976: tmp0 = x[i1];
2977: tmp1 = x[i2];
2978: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2979: v1 += 2;
2980: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2981: v2 += 2;
2982: }
2983: if (n == sz - 1) {
2984: tmp0 = x[*idx];
2985: sum1 -= v1[0] * tmp0;
2986: sum2 -= v2[0] * tmp0;
2987: }
2988: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2989: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2990: break;
2991: case 3:
2992: v2 = a->a + ii[row + 1];
2993: v3 = a->a + ii[row + 2];
2994: sum1 = b[row];
2995: sum2 = b[row + 1];
2996: sum3 = b[row + 2];
2997: for (n = 0; n < sz - 1; n += 2) {
2998: i1 = idx[0];
2999: i2 = idx[1];
3000: idx += 2;
3001: tmp0 = x[i1];
3002: tmp1 = x[i2];
3003: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3004: v1 += 2;
3005: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3006: v2 += 2;
3007: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3008: v3 += 2;
3009: }
3010: if (n == sz - 1) {
3011: tmp0 = x[*idx++];
3012: sum1 -= v1[0] * tmp0;
3013: sum2 -= v2[0] * tmp0;
3014: sum3 -= v3[0] * tmp0;
3015: v1++;
3016: v2++;
3017: v3++;
3018: }
3019: t[row] = sum1;
3020: t[row + 1] = sum2;
3021: t[row + 2] = sum3;
3022: sz = ii[row + 1] - diag[row] - 3;
3023: idx = a->j + diag[row] + 3;
3024: v1 += 3;
3025: v2 += 3;
3026: v3 += 3;
3027: for (n = 0; n < sz - 1; n += 2) {
3028: i1 = idx[0];
3029: i2 = idx[1];
3030: idx += 2;
3031: tmp0 = x[i1];
3032: tmp1 = x[i2];
3033: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3034: v1 += 2;
3035: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3036: v2 += 2;
3037: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3038: v3 += 2;
3039: }
3040: if (n == sz - 1) {
3041: tmp0 = x[*idx];
3042: sum1 -= v1[0] * tmp0;
3043: sum2 -= v2[0] * tmp0;
3044: sum3 -= v3[0] * tmp0;
3045: }
3046: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3047: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3048: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3049: break;
3050: case 4:
3051: v2 = a->a + ii[row + 1];
3052: v3 = a->a + ii[row + 2];
3053: v4 = a->a + ii[row + 3];
3054: sum1 = b[row];
3055: sum2 = b[row + 1];
3056: sum3 = b[row + 2];
3057: sum4 = b[row + 3];
3058: for (n = 0; n < sz - 1; n += 2) {
3059: i1 = idx[0];
3060: i2 = idx[1];
3061: idx += 2;
3062: tmp0 = x[i1];
3063: tmp1 = x[i2];
3064: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3065: v1 += 2;
3066: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3067: v2 += 2;
3068: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3069: v3 += 2;
3070: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3071: v4 += 2;
3072: }
3073: if (n == sz - 1) {
3074: tmp0 = x[*idx++];
3075: sum1 -= v1[0] * tmp0;
3076: sum2 -= v2[0] * tmp0;
3077: sum3 -= v3[0] * tmp0;
3078: sum4 -= v4[0] * tmp0;
3079: v1++;
3080: v2++;
3081: v3++;
3082: v4++;
3083: }
3084: t[row] = sum1;
3085: t[row + 1] = sum2;
3086: t[row + 2] = sum3;
3087: t[row + 3] = sum4;
3088: sz = ii[row + 1] - diag[row] - 4;
3089: idx = a->j + diag[row] + 4;
3090: v1 += 4;
3091: v2 += 4;
3092: v3 += 4;
3093: v4 += 4;
3094: for (n = 0; n < sz - 1; n += 2) {
3095: i1 = idx[0];
3096: i2 = idx[1];
3097: idx += 2;
3098: tmp0 = x[i1];
3099: tmp1 = x[i2];
3100: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3101: v1 += 2;
3102: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3103: v2 += 2;
3104: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3105: v3 += 2;
3106: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3107: v4 += 2;
3108: }
3109: if (n == sz - 1) {
3110: tmp0 = x[*idx];
3111: sum1 -= v1[0] * tmp0;
3112: sum2 -= v2[0] * tmp0;
3113: sum3 -= v3[0] * tmp0;
3114: sum4 -= v4[0] * tmp0;
3115: }
3116: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3117: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3118: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3119: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3120: break;
3121: case 5:
3122: v2 = a->a + ii[row + 1];
3123: v3 = a->a + ii[row + 2];
3124: v4 = a->a + ii[row + 3];
3125: v5 = a->a + ii[row + 4];
3126: sum1 = b[row];
3127: sum2 = b[row + 1];
3128: sum3 = b[row + 2];
3129: sum4 = b[row + 3];
3130: sum5 = b[row + 4];
3131: for (n = 0; n < sz - 1; n += 2) {
3132: i1 = idx[0];
3133: i2 = idx[1];
3134: idx += 2;
3135: tmp0 = x[i1];
3136: tmp1 = x[i2];
3137: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3138: v1 += 2;
3139: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3140: v2 += 2;
3141: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3142: v3 += 2;
3143: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3144: v4 += 2;
3145: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3146: v5 += 2;
3147: }
3148: if (n == sz - 1) {
3149: tmp0 = x[*idx++];
3150: sum1 -= v1[0] * tmp0;
3151: sum2 -= v2[0] * tmp0;
3152: sum3 -= v3[0] * tmp0;
3153: sum4 -= v4[0] * tmp0;
3154: sum5 -= v5[0] * tmp0;
3155: v1++;
3156: v2++;
3157: v3++;
3158: v4++;
3159: v5++;
3160: }
3161: t[row] = sum1;
3162: t[row + 1] = sum2;
3163: t[row + 2] = sum3;
3164: t[row + 3] = sum4;
3165: t[row + 4] = sum5;
3166: sz = ii[row + 1] - diag[row] - 5;
3167: idx = a->j + diag[row] + 5;
3168: v1 += 5;
3169: v2 += 5;
3170: v3 += 5;
3171: v4 += 5;
3172: v5 += 5;
3173: for (n = 0; n < sz - 1; n += 2) {
3174: i1 = idx[0];
3175: i2 = idx[1];
3176: idx += 2;
3177: tmp0 = x[i1];
3178: tmp1 = x[i2];
3179: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3180: v1 += 2;
3181: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3182: v2 += 2;
3183: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3184: v3 += 2;
3185: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3186: v4 += 2;
3187: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3188: v5 += 2;
3189: }
3190: if (n == sz - 1) {
3191: tmp0 = x[*idx];
3192: sum1 -= v1[0] * tmp0;
3193: sum2 -= v2[0] * tmp0;
3194: sum3 -= v3[0] * tmp0;
3195: sum4 -= v4[0] * tmp0;
3196: sum5 -= v5[0] * tmp0;
3197: }
3198: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3199: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3200: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3201: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3202: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3203: break;
3204: default:
3205: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3206: }
3207: }
3208: xb = t;
3209: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3210: } else xb = b;
3212: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3213: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3214: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3215: nodesz = sizes[i + 1] - sizes[i];
3216: ibdiag -= nodesz * nodesz;
3218: /* set RHS */
3219: if (xb == b) {
3220: /* whole (old way) */
3221: sz = ii[row + 1] - ii[row];
3222: idx = a->j + ii[row];
3223: switch (nodesz) {
3224: case 5:
3225: v5 = a->a + ii[row - 4]; /* fall through */
3226: case 4:
3227: v4 = a->a + ii[row - 3]; /* fall through */
3228: case 3:
3229: v3 = a->a + ii[row - 2]; /* fall through */
3230: case 2:
3231: v2 = a->a + ii[row - 1]; /* fall through */
3232: case 1:
3233: v1 = a->a + ii[row];
3234: break;
3235: default:
3236: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3237: }
3238: } else {
3239: /* upper, no diag */
3240: sz = ii[row + 1] - diag[row] - 1;
3241: idx = a->j + diag[row] + 1;
3242: switch (nodesz) {
3243: case 5:
3244: v5 = a->a + diag[row - 4] + 5; /* fall through */
3245: case 4:
3246: v4 = a->a + diag[row - 3] + 4; /* fall through */
3247: case 3:
3248: v3 = a->a + diag[row - 2] + 3; /* fall through */
3249: case 2:
3250: v2 = a->a + diag[row - 1] + 2; /* fall through */
3251: case 1:
3252: v1 = a->a + diag[row] + 1;
3253: }
3254: }
3255: /* set sum */
3256: switch (nodesz) {
3257: case 5:
3258: sum5 = xb[row - 4]; /* fall through */
3259: case 4:
3260: sum4 = xb[row - 3]; /* fall through */
3261: case 3:
3262: sum3 = xb[row - 2]; /* fall through */
3263: case 2:
3264: sum2 = xb[row - 1]; /* fall through */
3265: case 1:
3266: /* note that sum1 is associated with the last row */
3267: sum1 = xb[row];
3268: }
3269: /* do sums */
3270: for (n = 0; n < sz - 1; n += 2) {
3271: i1 = idx[0];
3272: i2 = idx[1];
3273: idx += 2;
3274: tmp0 = x[i1];
3275: tmp1 = x[i2];
3276: switch (nodesz) {
3277: case 5:
3278: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3279: v5 += 2; /* fall through */
3280: case 4:
3281: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3282: v4 += 2; /* fall through */
3283: case 3:
3284: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3285: v3 += 2; /* fall through */
3286: case 2:
3287: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3288: v2 += 2; /* fall through */
3289: case 1:
3290: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3291: v1 += 2;
3292: }
3293: }
3294: /* ragged edge */
3295: if (n == sz - 1) {
3296: tmp0 = x[*idx];
3297: switch (nodesz) {
3298: case 5:
3299: sum5 -= *v5 * tmp0; /* fall through */
3300: case 4:
3301: sum4 -= *v4 * tmp0; /* fall through */
3302: case 3:
3303: sum3 -= *v3 * tmp0; /* fall through */
3304: case 2:
3305: sum2 -= *v2 * tmp0; /* fall through */
3306: case 1:
3307: sum1 -= *v1 * tmp0;
3308: }
3309: }
3310: /* update */
3311: if (xb == b) {
3312: /* whole (old way) w/ diag */
3313: switch (nodesz) {
3314: case 5:
3315: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3316: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3317: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3318: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3319: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3320: break;
3321: case 4:
3322: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3323: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3324: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3325: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3326: break;
3327: case 3:
3328: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3329: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3330: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3331: break;
3332: case 2:
3333: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3334: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3335: break;
3336: case 1:
3337: x[row--] += sum1 * (*ibdiag);
3338: break;
3339: }
3340: } else {
3341: /* no diag so set = */
3342: switch (nodesz) {
3343: case 5:
3344: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3345: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3346: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3347: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3348: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3349: break;
3350: case 4:
3351: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3352: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3353: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3354: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3355: break;
3356: case 3:
3357: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3358: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3359: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3360: break;
3361: case 2:
3362: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3363: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3364: break;
3365: case 1:
3366: x[row--] = sum1 * (*ibdiag);
3367: break;
3368: }
3369: }
3370: }
3371: if (xb == b) PetscCall(PetscLogFlops(2.0 * a->nz));
3372: else PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3373: }
3374: }
3375: if (flag & SOR_EISENSTAT) {
3376: /*
3377: Apply (U + D)^-1 where D is now the block diagonal
3378: */
3379: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3380: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3381: nodesz = sizes[i + 1] - sizes[i];
3382: ibdiag -= nodesz * nodesz;
3383: sz = ii[row + 1] - diag[row] - 1;
3384: v1 = a->a + diag[row] + 1;
3385: idx = a->j + diag[row] + 1;
3386: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3387: switch (nodesz) {
3388: case 1:
3390: sum1 = b[row];
3391: for (n = 0; n < sz - 1; n += 2) {
3392: i1 = idx[0];
3393: i2 = idx[1];
3394: idx += 2;
3395: tmp0 = x[i1];
3396: tmp1 = x[i2];
3397: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3398: v1 += 2;
3399: }
3401: if (n == sz - 1) {
3402: tmp0 = x[*idx];
3403: sum1 -= *v1 * tmp0;
3404: }
3405: x[row] = sum1 * (*ibdiag);
3406: row--;
3407: break;
3409: case 2:
3411: sum1 = b[row];
3412: sum2 = b[row - 1];
3413: /* note that sum1 is associated with the second of the two rows */
3414: v2 = a->a + diag[row - 1] + 2;
3415: for (n = 0; n < sz - 1; n += 2) {
3416: i1 = idx[0];
3417: i2 = idx[1];
3418: idx += 2;
3419: tmp0 = x[i1];
3420: tmp1 = x[i2];
3421: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3422: v1 += 2;
3423: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3424: v2 += 2;
3425: }
3427: if (n == sz - 1) {
3428: tmp0 = x[*idx];
3429: sum1 -= *v1 * tmp0;
3430: sum2 -= *v2 * tmp0;
3431: }
3432: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3433: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3434: row -= 2;
3435: break;
3436: case 3:
3438: sum1 = b[row];
3439: sum2 = b[row - 1];
3440: sum3 = b[row - 2];
3441: v2 = a->a + diag[row - 1] + 2;
3442: v3 = a->a + diag[row - 2] + 3;
3443: for (n = 0; n < sz - 1; n += 2) {
3444: i1 = idx[0];
3445: i2 = idx[1];
3446: idx += 2;
3447: tmp0 = x[i1];
3448: tmp1 = x[i2];
3449: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3450: v1 += 2;
3451: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3452: v2 += 2;
3453: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3454: v3 += 2;
3455: }
3457: if (n == sz - 1) {
3458: tmp0 = x[*idx];
3459: sum1 -= *v1 * tmp0;
3460: sum2 -= *v2 * tmp0;
3461: sum3 -= *v3 * tmp0;
3462: }
3463: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3464: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3465: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3466: row -= 3;
3467: break;
3468: case 4:
3470: sum1 = b[row];
3471: sum2 = b[row - 1];
3472: sum3 = b[row - 2];
3473: sum4 = b[row - 3];
3474: v2 = a->a + diag[row - 1] + 2;
3475: v3 = a->a + diag[row - 2] + 3;
3476: v4 = a->a + diag[row - 3] + 4;
3477: for (n = 0; n < sz - 1; n += 2) {
3478: i1 = idx[0];
3479: i2 = idx[1];
3480: idx += 2;
3481: tmp0 = x[i1];
3482: tmp1 = x[i2];
3483: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3484: v1 += 2;
3485: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3486: v2 += 2;
3487: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3488: v3 += 2;
3489: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3490: v4 += 2;
3491: }
3493: if (n == sz - 1) {
3494: tmp0 = x[*idx];
3495: sum1 -= *v1 * tmp0;
3496: sum2 -= *v2 * tmp0;
3497: sum3 -= *v3 * tmp0;
3498: sum4 -= *v4 * tmp0;
3499: }
3500: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3501: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3502: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3503: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3504: row -= 4;
3505: break;
3506: case 5:
3508: sum1 = b[row];
3509: sum2 = b[row - 1];
3510: sum3 = b[row - 2];
3511: sum4 = b[row - 3];
3512: sum5 = b[row - 4];
3513: v2 = a->a + diag[row - 1] + 2;
3514: v3 = a->a + diag[row - 2] + 3;
3515: v4 = a->a + diag[row - 3] + 4;
3516: v5 = a->a + diag[row - 4] + 5;
3517: for (n = 0; n < sz - 1; n += 2) {
3518: i1 = idx[0];
3519: i2 = idx[1];
3520: idx += 2;
3521: tmp0 = x[i1];
3522: tmp1 = x[i2];
3523: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3524: v1 += 2;
3525: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3526: v2 += 2;
3527: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3528: v3 += 2;
3529: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3530: v4 += 2;
3531: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3532: v5 += 2;
3533: }
3535: if (n == sz - 1) {
3536: tmp0 = x[*idx];
3537: sum1 -= *v1 * tmp0;
3538: sum2 -= *v2 * tmp0;
3539: sum3 -= *v3 * tmp0;
3540: sum4 -= *v4 * tmp0;
3541: sum5 -= *v5 * tmp0;
3542: }
3543: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3544: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3545: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3546: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3547: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3548: row -= 5;
3549: break;
3550: default:
3551: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3552: }
3553: }
3554: PetscCall(PetscLogFlops(a->nz));
3556: /*
3557: t = b - D x where D is the block diagonal
3558: */
3559: cnt = 0;
3560: for (i = 0, row = 0; i < m; i++) {
3561: nodesz = sizes[i + 1] - sizes[i];
3562: switch (nodesz) {
3563: case 1:
3564: t[row] = b[row] - bdiag[cnt++] * x[row];
3565: row++;
3566: break;
3567: case 2:
3568: x1 = x[row];
3569: x2 = x[row + 1];
3570: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3571: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3572: t[row] = b[row] - tmp1;
3573: t[row + 1] = b[row + 1] - tmp2;
3574: row += 2;
3575: cnt += 4;
3576: break;
3577: case 3:
3578: x1 = x[row];
3579: x2 = x[row + 1];
3580: x3 = x[row + 2];
3581: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3582: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3583: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3584: t[row] = b[row] - tmp1;
3585: t[row + 1] = b[row + 1] - tmp2;
3586: t[row + 2] = b[row + 2] - tmp3;
3587: row += 3;
3588: cnt += 9;
3589: break;
3590: case 4:
3591: x1 = x[row];
3592: x2 = x[row + 1];
3593: x3 = x[row + 2];
3594: x4 = x[row + 3];
3595: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3596: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3597: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3598: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3599: t[row] = b[row] - tmp1;
3600: t[row + 1] = b[row + 1] - tmp2;
3601: t[row + 2] = b[row + 2] - tmp3;
3602: t[row + 3] = b[row + 3] - tmp4;
3603: row += 4;
3604: cnt += 16;
3605: break;
3606: case 5:
3607: x1 = x[row];
3608: x2 = x[row + 1];
3609: x3 = x[row + 2];
3610: x4 = x[row + 3];
3611: x5 = x[row + 4];
3612: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3613: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3614: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3615: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3616: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3617: t[row] = b[row] - tmp1;
3618: t[row + 1] = b[row + 1] - tmp2;
3619: t[row + 2] = b[row + 2] - tmp3;
3620: t[row + 3] = b[row + 3] - tmp4;
3621: t[row + 4] = b[row + 4] - tmp5;
3622: row += 5;
3623: cnt += 25;
3624: break;
3625: default:
3626: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3627: }
3628: }
3629: PetscCall(PetscLogFlops(m));
3631: /*
3632: Apply (L + D)^-1 where D is the block diagonal
3633: */
3634: for (i = 0, row = 0; i < m; i++) {
3635: nodesz = sizes[i + 1] - sizes[i];
3636: sz = diag[row] - ii[row];
3637: v1 = a->a + ii[row];
3638: idx = a->j + ii[row];
3639: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3640: switch (nodesz) {
3641: case 1:
3643: sum1 = t[row];
3644: for (n = 0; n < sz - 1; n += 2) {
3645: i1 = idx[0];
3646: i2 = idx[1];
3647: idx += 2;
3648: tmp0 = t[i1];
3649: tmp1 = t[i2];
3650: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3651: v1 += 2;
3652: }
3654: if (n == sz - 1) {
3655: tmp0 = t[*idx];
3656: sum1 -= *v1 * tmp0;
3657: }
3658: x[row] += t[row] = sum1 * (*ibdiag++);
3659: row++;
3660: break;
3661: case 2:
3662: v2 = a->a + ii[row + 1];
3663: sum1 = t[row];
3664: sum2 = t[row + 1];
3665: for (n = 0; n < sz - 1; n += 2) {
3666: i1 = idx[0];
3667: i2 = idx[1];
3668: idx += 2;
3669: tmp0 = t[i1];
3670: tmp1 = t[i2];
3671: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3672: v1 += 2;
3673: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3674: v2 += 2;
3675: }
3677: if (n == sz - 1) {
3678: tmp0 = t[*idx];
3679: sum1 -= v1[0] * tmp0;
3680: sum2 -= v2[0] * tmp0;
3681: }
3682: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3683: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3684: ibdiag += 4;
3685: row += 2;
3686: break;
3687: case 3:
3688: v2 = a->a + ii[row + 1];
3689: v3 = a->a + ii[row + 2];
3690: sum1 = t[row];
3691: sum2 = t[row + 1];
3692: sum3 = t[row + 2];
3693: for (n = 0; n < sz - 1; n += 2) {
3694: i1 = idx[0];
3695: i2 = idx[1];
3696: idx += 2;
3697: tmp0 = t[i1];
3698: tmp1 = t[i2];
3699: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3700: v1 += 2;
3701: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3702: v2 += 2;
3703: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3704: v3 += 2;
3705: }
3707: if (n == sz - 1) {
3708: tmp0 = t[*idx];
3709: sum1 -= v1[0] * tmp0;
3710: sum2 -= v2[0] * tmp0;
3711: sum3 -= v3[0] * tmp0;
3712: }
3713: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3714: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3715: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3716: ibdiag += 9;
3717: row += 3;
3718: break;
3719: case 4:
3720: v2 = a->a + ii[row + 1];
3721: v3 = a->a + ii[row + 2];
3722: v4 = a->a + ii[row + 3];
3723: sum1 = t[row];
3724: sum2 = t[row + 1];
3725: sum3 = t[row + 2];
3726: sum4 = t[row + 3];
3727: for (n = 0; n < sz - 1; n += 2) {
3728: i1 = idx[0];
3729: i2 = idx[1];
3730: idx += 2;
3731: tmp0 = t[i1];
3732: tmp1 = t[i2];
3733: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3734: v1 += 2;
3735: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3736: v2 += 2;
3737: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3738: v3 += 2;
3739: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3740: v4 += 2;
3741: }
3743: if (n == sz - 1) {
3744: tmp0 = t[*idx];
3745: sum1 -= v1[0] * tmp0;
3746: sum2 -= v2[0] * tmp0;
3747: sum3 -= v3[0] * tmp0;
3748: sum4 -= v4[0] * tmp0;
3749: }
3750: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3751: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3752: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3753: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3754: ibdiag += 16;
3755: row += 4;
3756: break;
3757: case 5:
3758: v2 = a->a + ii[row + 1];
3759: v3 = a->a + ii[row + 2];
3760: v4 = a->a + ii[row + 3];
3761: v5 = a->a + ii[row + 4];
3762: sum1 = t[row];
3763: sum2 = t[row + 1];
3764: sum3 = t[row + 2];
3765: sum4 = t[row + 3];
3766: sum5 = t[row + 4];
3767: for (n = 0; n < sz - 1; n += 2) {
3768: i1 = idx[0];
3769: i2 = idx[1];
3770: idx += 2;
3771: tmp0 = t[i1];
3772: tmp1 = t[i2];
3773: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3774: v1 += 2;
3775: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3776: v2 += 2;
3777: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3778: v3 += 2;
3779: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3780: v4 += 2;
3781: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3782: v5 += 2;
3783: }
3785: if (n == sz - 1) {
3786: tmp0 = t[*idx];
3787: sum1 -= v1[0] * tmp0;
3788: sum2 -= v2[0] * tmp0;
3789: sum3 -= v3[0] * tmp0;
3790: sum4 -= v4[0] * tmp0;
3791: sum5 -= v5[0] * tmp0;
3792: }
3793: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3794: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3795: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3796: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3797: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3798: ibdiag += 25;
3799: row += 5;
3800: break;
3801: default:
3802: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3803: }
3804: }
3805: PetscCall(PetscLogFlops(a->nz));
3806: }
3807: PetscCall(VecRestoreArray(xx, &x));
3808: PetscCall(VecRestoreArrayRead(bb, &b));
3809: PetscFunctionReturn(PETSC_SUCCESS);
3810: }
3812: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
3813: {
3814: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3815: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
3816: const MatScalar *bdiag = a->inode.bdiag;
3817: const PetscScalar *b;
3818: PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
3819: const PetscInt *sizes = a->inode.size_csr;
3821: PetscFunctionBegin;
3822: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3823: PetscCall(VecGetArray(xx, &x));
3824: PetscCall(VecGetArrayRead(bb, &b));
3825: cnt = 0;
3826: for (i = 0, row = 0; i < m; i++) {
3827: nodesz = sizes[i + 1] - sizes[i];
3828: switch (nodesz) {
3829: case 1:
3830: x[row] = b[row] * bdiag[cnt++];
3831: row++;
3832: break;
3833: case 2:
3834: x1 = b[row];
3835: x2 = b[row + 1];
3836: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3837: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3838: x[row++] = tmp1;
3839: x[row++] = tmp2;
3840: cnt += 4;
3841: break;
3842: case 3:
3843: x1 = b[row];
3844: x2 = b[row + 1];
3845: x3 = b[row + 2];
3846: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3847: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3848: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3849: x[row++] = tmp1;
3850: x[row++] = tmp2;
3851: x[row++] = tmp3;
3852: cnt += 9;
3853: break;
3854: case 4:
3855: x1 = b[row];
3856: x2 = b[row + 1];
3857: x3 = b[row + 2];
3858: x4 = b[row + 3];
3859: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3860: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3861: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3862: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3863: x[row++] = tmp1;
3864: x[row++] = tmp2;
3865: x[row++] = tmp3;
3866: x[row++] = tmp4;
3867: cnt += 16;
3868: break;
3869: case 5:
3870: x1 = b[row];
3871: x2 = b[row + 1];
3872: x3 = b[row + 2];
3873: x4 = b[row + 3];
3874: x5 = b[row + 4];
3875: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3876: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3877: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3878: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3879: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3880: x[row++] = tmp1;
3881: x[row++] = tmp2;
3882: x[row++] = tmp3;
3883: x[row++] = tmp4;
3884: x[row++] = tmp5;
3885: cnt += 25;
3886: break;
3887: default:
3888: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3889: }
3890: }
3891: PetscCall(PetscLogFlops(2.0 * cnt));
3892: PetscCall(VecRestoreArray(xx, &x));
3893: PetscCall(VecRestoreArrayRead(bb, &b));
3894: PetscFunctionReturn(PETSC_SUCCESS);
3895: }
3897: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
3898: {
3899: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3901: PetscFunctionBegin;
3902: a->inode.node_count = 0;
3903: a->inode.use = PETSC_FALSE;
3904: a->inode.checked = PETSC_FALSE;
3905: a->inode.mat_nonzerostate = -1;
3906: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
3907: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
3908: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
3909: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
3910: A->ops->coloringpatch = NULL;
3911: A->ops->multdiagonalblock = NULL;
3912: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
3913: PetscFunctionReturn(PETSC_SUCCESS);
3914: }
3916: /*
3917: samestructure indicates that the matrix has not changed its nonzero structure so we
3918: do not need to recompute the inodes
3919: */
3920: PetscErrorCode MatSeqAIJCheckInode(Mat A)
3921: {
3922: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3923: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
3924: PetscBool flag;
3925: const PetscInt *idx, *idy, *ii;
3927: PetscFunctionBegin;
3928: if (!a->inode.use) {
3929: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3930: PetscCall(PetscFree(a->inode.size_csr));
3931: PetscFunctionReturn(PETSC_SUCCESS);
3932: }
3933: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
3935: m = A->rmap->n;
3936: if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
3937: ns = a->inode.size_csr;
3938: ns[0] = 0;
3940: i = 0;
3941: node_count = 0;
3942: idx = a->j;
3943: ii = a->i;
3944: if (idx) {
3945: while (i < m) { /* For each row */
3946: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
3947: /* Limits the number of elements in a node to 'a->inode.limit' */
3948: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
3949: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
3950: if (nzy != nzx) break;
3951: idy += nzx; /* Same nonzero pattern */
3952: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
3953: if (!flag) break;
3954: }
3955: ns[node_count + 1] = ns[node_count] + blk_size;
3956: node_count++;
3957: idx += blk_size * nzx;
3958: i = j;
3959: }
3960: }
3961: /* If not enough inodes found,, do not use inode version of the routines */
3962: if (!m || !idx || node_count > .8 * m) {
3963: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3964: PetscCall(PetscFree(a->inode.size_csr));
3965: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
3966: } else {
3967: if (!A->factortype) {
3968: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3969: if (A->rmap->n == A->cmap->n) {
3970: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
3971: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
3972: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
3973: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
3974: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
3975: }
3976: } else {
3977: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3978: }
3979: a->inode.node_count = node_count;
3980: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
3981: }
3982: a->inode.checked = PETSC_TRUE;
3983: a->inode.mat_nonzerostate = A->nonzerostate;
3984: PetscFunctionReturn(PETSC_SUCCESS);
3985: }
3987: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
3988: {
3989: Mat B = *C;
3990: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
3991: PetscInt m = A->rmap->n;
3993: PetscFunctionBegin;
3994: c->inode.use = a->inode.use;
3995: c->inode.limit = a->inode.limit;
3996: c->inode.max_limit = a->inode.max_limit;
3997: c->inode.checked = PETSC_FALSE;
3998: c->inode.size_csr = NULL;
3999: c->inode.node_count = 0;
4000: c->inode.ibdiag = NULL;
4001: c->inode.bdiag = NULL;
4002: c->inode.mat_nonzerostate = -1;
4003: if (a->inode.use) {
4004: if (a->inode.checked && a->inode.size_csr) {
4005: PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4006: PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4008: c->inode.checked = PETSC_TRUE;
4009: c->inode.node_count = a->inode.node_count;
4010: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4011: }
4012: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4013: if (!B->factortype) {
4014: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4015: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4016: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4017: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4018: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4019: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4020: } else {
4021: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4022: }
4023: }
4024: PetscFunctionReturn(PETSC_SUCCESS);
4025: }
4027: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4028: {
4029: PetscInt k;
4030: const PetscInt *vi;
4032: PetscFunctionBegin;
4033: vi = aj + ai[row];
4034: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4035: vi = aj + adiag[row];
4036: cols[nzl] = vi[0];
4037: vi = aj + adiag[row + 1] + 1;
4038: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4039: PetscFunctionReturn(PETSC_SUCCESS);
4040: }
4041: /*
4042: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4043: Modified from MatSeqAIJCheckInode().
4045: Input Parameters:
4046: . Mat A - ILU or LU matrix factor
4048: */
4049: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4050: {
4051: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4052: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4053: PetscInt *cols1, *cols2, *ns;
4054: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4055: PetscBool flag;
4057: PetscFunctionBegin;
4058: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4059: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4061: m = A->rmap->n;
4062: if (a->inode.size_csr) ns = a->inode.size_csr;
4063: else PetscCall(PetscMalloc1(m + 1, &ns));
4064: ns[0] = 0;
4066: i = 0;
4067: node_count = 0;
4068: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4069: while (i < m) { /* For each row */
4070: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4071: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4072: nzx = nzl1 + nzu1 + 1;
4073: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4075: /* Limits the number of elements in a node to 'a->inode.limit' */
4076: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4077: nzl2 = ai[j + 1] - ai[j];
4078: nzu2 = adiag[j] - adiag[j + 1] - 1;
4079: nzy = nzl2 + nzu2 + 1;
4080: if (nzy != nzx) break;
4081: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4082: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4083: if (!flag) break;
4084: }
4085: ns[node_count + 1] = ns[node_count] + blk_size;
4086: node_count++;
4087: i = j;
4088: }
4089: PetscCall(PetscFree2(cols1, cols2));
4090: /* If not enough inodes found,, do not use inode version of the routines */
4091: if (!m || node_count > .8 * m) {
4092: PetscCall(PetscFree(ns));
4094: a->inode.node_count = 0;
4095: a->inode.size_csr = NULL;
4096: a->inode.use = PETSC_FALSE;
4098: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4099: } else {
4100: A->ops->mult = NULL;
4101: A->ops->sor = NULL;
4102: A->ops->multadd = NULL;
4103: A->ops->getrowij = NULL;
4104: A->ops->restorerowij = NULL;
4105: A->ops->getcolumnij = NULL;
4106: A->ops->restorecolumnij = NULL;
4107: A->ops->coloringpatch = NULL;
4108: A->ops->multdiagonalblock = NULL;
4109: a->inode.node_count = node_count;
4110: a->inode.size_csr = ns;
4111: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4112: }
4113: a->inode.checked = PETSC_TRUE;
4114: PetscFunctionReturn(PETSC_SUCCESS);
4115: }
4117: /*
4118: This is really ugly. if inodes are used this replaces the
4119: permutations with ones that correspond to rows/cols of the matrix
4120: rather than inode blocks
4121: */
4122: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4123: {
4124: PetscFunctionBegin;
4125: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4126: PetscFunctionReturn(PETSC_SUCCESS);
4127: }
4129: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4130: {
4131: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4132: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4133: const PetscInt *ridx, *cidx;
4134: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4135: PetscInt nslim_col, *ns_col;
4136: IS ris = *rperm, cis = *cperm;
4138: PetscFunctionBegin;
4139: if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4140: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4142: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4143: PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4144: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4146: PetscCall(ISGetIndices(ris, &ridx));
4147: PetscCall(ISGetIndices(cis, &cidx));
4149: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4150: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4152: /* Construct the permutations for rows*/
4153: for (i = 0, row = 0; i < nslim_row; ++i) {
4154: indx = ridx[i];
4155: start_val = tns[indx];
4156: end_val = tns[indx + 1];
4157: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4158: }
4160: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4161: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4163: /* Construct permutations for columns */
4164: for (i = 0, col = 0; i < nslim_col; ++i) {
4165: indx = cidx[i];
4166: start_val = tns[indx];
4167: end_val = tns[indx + 1];
4168: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4169: }
4171: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4172: PetscCall(ISSetPermutation(*rperm));
4173: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4174: PetscCall(ISSetPermutation(*cperm));
4176: PetscCall(ISRestoreIndices(ris, &ridx));
4177: PetscCall(ISRestoreIndices(cis, &cidx));
4179: PetscCall(PetscFree(ns_col));
4180: PetscCall(PetscFree2(permr, permc));
4181: PetscCall(ISDestroy(&cis));
4182: PetscCall(ISDestroy(&ris));
4183: PetscCall(PetscFree(tns));
4184: PetscFunctionReturn(PETSC_SUCCESS);
4185: }
4187: /*@C
4188: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4190: Not Collective
4192: Input Parameter:
4193: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4195: Output Parameters:
4196: + node_count - no of inodes present in the matrix.
4197: . sizes - an array of size `node_count`, with the sizes of each inode.
4198: - limit - the max size used to generate the inodes.
4200: Level: advanced
4202: Note:
4203: It should be called after the matrix is assembled.
4204: The contents of the sizes[] array should not be changed.
4205: `NULL` may be passed for information not needed
4207: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4208: @*/
4209: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4210: {
4211: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4213: PetscFunctionBegin;
4214: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4215: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4216: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4217: PetscFunctionReturn(PETSC_SUCCESS);
4218: }
4220: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4221: {
4222: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4224: PetscFunctionBegin;
4225: if (node_count) *node_count = a->inode.node_count;
4226: if (sizes) *sizes = a->inode.size_csr;
4227: if (limit) *limit = a->inode.limit;
4228: PetscFunctionReturn(PETSC_SUCCESS);
4229: }