Actual source code: baijfact13.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 3 by 3
9: */
10: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic;
15: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17: const PetscInt *diag_offset;
18: PetscInt idx, *pj;
19: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
20: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
22: MatScalar *ba = b->a, *aa = a->a;
23: PetscReal shift = info->shiftamount;
24: PetscBool allowzeropivot, zeropivotdetected;
26: PetscFunctionBegin;
27: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
28: A->factortype = MAT_FACTOR_NONE;
29: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
30: A->factortype = MAT_FACTOR_ILU;
31: PetscCall(ISGetIndices(isrow, &r));
32: PetscCall(ISGetIndices(isicol, &ic));
33: PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
34: allowzeropivot = PetscNot(A->erroriffailure);
36: for (i = 0; i < n; i++) {
37: nz = bi[i + 1] - bi[i];
38: ajtmp = bj + bi[i];
39: for (j = 0; j < nz; j++) {
40: x = rtmp + 9 * ajtmp[j];
41: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
42: }
43: /* load in initial (unfactored row) */
44: idx = r[i];
45: nz = ai[idx + 1] - ai[idx];
46: ajtmpold = aj + ai[idx];
47: v = aa + 9 * ai[idx];
48: for (j = 0; j < nz; j++) {
49: x = rtmp + 9 * ic[ajtmpold[j]];
50: x[0] = v[0];
51: x[1] = v[1];
52: x[2] = v[2];
53: x[3] = v[3];
54: x[4] = v[4];
55: x[5] = v[5];
56: x[6] = v[6];
57: x[7] = v[7];
58: x[8] = v[8];
59: v += 9;
60: }
61: row = *ajtmp++;
62: while (row < i) {
63: pc = rtmp + 9 * row;
64: p1 = pc[0];
65: p2 = pc[1];
66: p3 = pc[2];
67: p4 = pc[3];
68: p5 = pc[4];
69: p6 = pc[5];
70: p7 = pc[6];
71: p8 = pc[7];
72: p9 = pc[8];
73: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
74: pv = ba + 9 * diag_offset[row];
75: pj = bj + diag_offset[row] + 1;
76: x1 = pv[0];
77: x2 = pv[1];
78: x3 = pv[2];
79: x4 = pv[3];
80: x5 = pv[4];
81: x6 = pv[5];
82: x7 = pv[6];
83: x8 = pv[7];
84: x9 = pv[8];
85: pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
86: pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
87: pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
89: pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
90: pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
91: pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
93: pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
94: pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
95: pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
96: nz = bi[row + 1] - diag_offset[row] - 1;
97: pv += 9;
98: for (j = 0; j < nz; j++) {
99: x1 = pv[0];
100: x2 = pv[1];
101: x3 = pv[2];
102: x4 = pv[3];
103: x5 = pv[4];
104: x6 = pv[5];
105: x7 = pv[6];
106: x8 = pv[7];
107: x9 = pv[8];
108: x = rtmp + 9 * pj[j];
109: x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
110: x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
111: x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
113: x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
114: x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
115: x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
117: x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
118: x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
119: x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
120: pv += 9;
121: }
122: PetscCall(PetscLogFlops(54.0 * nz + 36.0));
123: }
124: row = *ajtmp++;
125: }
126: /* finished row so stick it into b->a */
127: pv = ba + 9 * bi[i];
128: pj = bj + bi[i];
129: nz = bi[i + 1] - bi[i];
130: for (j = 0; j < nz; j++) {
131: x = rtmp + 9 * pj[j];
132: pv[0] = x[0];
133: pv[1] = x[1];
134: pv[2] = x[2];
135: pv[3] = x[3];
136: pv[4] = x[4];
137: pv[5] = x[5];
138: pv[6] = x[6];
139: pv[7] = x[7];
140: pv[8] = x[8];
141: pv += 9;
142: }
143: /* invert diagonal block */
144: w = ba + 9 * diag_offset[i];
145: PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
146: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
147: }
149: PetscCall(PetscFree(rtmp));
150: PetscCall(ISRestoreIndices(isicol, &ic));
151: PetscCall(ISRestoreIndices(isrow, &r));
153: C->ops->solve = MatSolve_SeqBAIJ_3_inplace;
154: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
155: C->assembled = PETSC_TRUE;
157: PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
158: PetscFunctionReturn(PETSC_SUCCESS);
159: }
161: /* MatLUFactorNumeric_SeqBAIJ_3 -
162: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163: PetscKernel_A_gets_A_times_B()
164: PetscKernel_A_gets_A_minus_B_times_C()
165: PetscKernel_A_gets_inverse_A()
166: */
167: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
168: {
169: Mat C = B;
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
171: IS isrow = b->row, isicol = b->icol;
172: const PetscInt *r, *ic;
173: PetscInt i, j, k, nz, nzL, row;
174: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
175: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
176: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
177: PetscInt flg;
178: PetscReal shift = info->shiftamount;
179: PetscBool allowzeropivot, zeropivotdetected;
181: PetscFunctionBegin;
182: PetscCall(ISGetIndices(isrow, &r));
183: PetscCall(ISGetIndices(isicol, &ic));
184: allowzeropivot = PetscNot(A->erroriffailure);
186: /* generate work space needed by the factorization */
187: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
188: PetscCall(PetscArrayzero(rtmp, bs2 * n));
190: for (i = 0; i < n; i++) {
191: /* zero rtmp */
192: /* L part */
193: nz = bi[i + 1] - bi[i];
194: bjtmp = bj + bi[i];
195: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
197: /* U part */
198: nz = bdiag[i] - bdiag[i + 1];
199: bjtmp = bj + bdiag[i + 1] + 1;
200: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
202: /* load in initial (unfactored row) */
203: nz = ai[r[i] + 1] - ai[r[i]];
204: ajtmp = aj + ai[r[i]];
205: v = aa + bs2 * ai[r[i]];
206: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
208: /* elimination */
209: bjtmp = bj + bi[i];
210: nzL = bi[i + 1] - bi[i];
211: for (k = 0; k < nzL; k++) {
212: row = bjtmp[k];
213: pc = rtmp + bs2 * row;
214: for (flg = 0, j = 0; j < bs2; j++) {
215: if (pc[j] != 0.0) {
216: flg = 1;
217: break;
218: }
219: }
220: if (flg) {
221: pv = b->a + bs2 * bdiag[row];
222: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
223: PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
225: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
226: pv = b->a + bs2 * (bdiag[row + 1] + 1);
227: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
228: for (j = 0; j < nz; j++) {
229: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
230: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
231: v = rtmp + bs2 * pj[j];
232: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
233: pv += bs2;
234: }
235: PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
236: }
237: }
239: /* finished row so stick it into b->a */
240: /* L part */
241: pv = b->a + bs2 * bi[i];
242: pj = b->j + bi[i];
243: nz = bi[i + 1] - bi[i];
244: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
246: /* Mark diagonal and invert diagonal for simpler triangular solves */
247: pv = b->a + bs2 * bdiag[i];
248: pj = b->j + bdiag[i];
249: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
250: PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
251: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
253: /* U part */
254: pj = b->j + bdiag[i + 1] + 1;
255: pv = b->a + bs2 * (bdiag[i + 1] + 1);
256: nz = bdiag[i] - bdiag[i + 1] - 1;
257: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
258: }
260: PetscCall(PetscFree2(rtmp, mwork));
261: PetscCall(ISRestoreIndices(isicol, &ic));
262: PetscCall(ISRestoreIndices(isrow, &r));
264: C->ops->solve = MatSolve_SeqBAIJ_3;
265: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
266: C->assembled = PETSC_TRUE;
268: PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
269: PetscFunctionReturn(PETSC_SUCCESS);
270: }
272: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
273: {
274: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
275: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
276: PetscInt *ajtmpold, *ajtmp, nz, row;
277: const PetscInt *diag_offset;
278: PetscInt *ai = a->i, *aj = a->j, *pj;
279: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
280: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
281: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
282: MatScalar *ba = b->a, *aa = a->a;
283: PetscReal shift = info->shiftamount;
284: PetscBool allowzeropivot, zeropivotdetected;
286: PetscFunctionBegin;
287: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
288: A->factortype = MAT_FACTOR_NONE;
289: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
290: A->factortype = MAT_FACTOR_ILU;
291: PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
292: allowzeropivot = PetscNot(A->erroriffailure);
294: for (i = 0; i < n; i++) {
295: nz = bi[i + 1] - bi[i];
296: ajtmp = bj + bi[i];
297: for (j = 0; j < nz; j++) {
298: x = rtmp + 9 * ajtmp[j];
299: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
300: }
301: /* load in initial (unfactored row) */
302: nz = ai[i + 1] - ai[i];
303: ajtmpold = aj + ai[i];
304: v = aa + 9 * ai[i];
305: for (j = 0; j < nz; j++) {
306: x = rtmp + 9 * ajtmpold[j];
307: x[0] = v[0];
308: x[1] = v[1];
309: x[2] = v[2];
310: x[3] = v[3];
311: x[4] = v[4];
312: x[5] = v[5];
313: x[6] = v[6];
314: x[7] = v[7];
315: x[8] = v[8];
316: v += 9;
317: }
318: row = *ajtmp++;
319: while (row < i) {
320: pc = rtmp + 9 * row;
321: p1 = pc[0];
322: p2 = pc[1];
323: p3 = pc[2];
324: p4 = pc[3];
325: p5 = pc[4];
326: p6 = pc[5];
327: p7 = pc[6];
328: p8 = pc[7];
329: p9 = pc[8];
330: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
331: pv = ba + 9 * diag_offset[row];
332: pj = bj + diag_offset[row] + 1;
333: x1 = pv[0];
334: x2 = pv[1];
335: x3 = pv[2];
336: x4 = pv[3];
337: x5 = pv[4];
338: x6 = pv[5];
339: x7 = pv[6];
340: x8 = pv[7];
341: x9 = pv[8];
342: pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
343: pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
344: pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
346: pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
347: pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
348: pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
350: pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
351: pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
352: pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
354: nz = bi[row + 1] - diag_offset[row] - 1;
355: pv += 9;
356: for (j = 0; j < nz; j++) {
357: x1 = pv[0];
358: x2 = pv[1];
359: x3 = pv[2];
360: x4 = pv[3];
361: x5 = pv[4];
362: x6 = pv[5];
363: x7 = pv[6];
364: x8 = pv[7];
365: x9 = pv[8];
366: x = rtmp + 9 * pj[j];
367: x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
368: x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
369: x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
371: x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
372: x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
373: x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
375: x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
376: x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
377: x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
378: pv += 9;
379: }
380: PetscCall(PetscLogFlops(54.0 * nz + 36.0));
381: }
382: row = *ajtmp++;
383: }
384: /* finished row so stick it into b->a */
385: pv = ba + 9 * bi[i];
386: pj = bj + bi[i];
387: nz = bi[i + 1] - bi[i];
388: for (j = 0; j < nz; j++) {
389: x = rtmp + 9 * pj[j];
390: pv[0] = x[0];
391: pv[1] = x[1];
392: pv[2] = x[2];
393: pv[3] = x[3];
394: pv[4] = x[4];
395: pv[5] = x[5];
396: pv[6] = x[6];
397: pv[7] = x[7];
398: pv[8] = x[8];
399: pv += 9;
400: }
401: /* invert diagonal block */
402: w = ba + 9 * diag_offset[i];
403: PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
404: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
405: }
407: PetscCall(PetscFree(rtmp));
409: C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
410: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
411: C->assembled = PETSC_TRUE;
413: PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
414: PetscFunctionReturn(PETSC_SUCCESS);
415: }
417: /*
418: MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
419: copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
420: */
421: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
422: {
423: Mat C = B;
424: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
425: PetscInt i, j, k, nz, nzL, row;
426: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
427: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
428: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
429: PetscInt flg;
430: PetscReal shift = info->shiftamount;
431: PetscBool allowzeropivot, zeropivotdetected;
433: PetscFunctionBegin;
434: allowzeropivot = PetscNot(A->erroriffailure);
436: /* generate work space needed by the factorization */
437: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
438: PetscCall(PetscArrayzero(rtmp, bs2 * n));
440: for (i = 0; i < n; i++) {
441: /* zero rtmp */
442: /* L part */
443: nz = bi[i + 1] - bi[i];
444: bjtmp = bj + bi[i];
445: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
447: /* U part */
448: nz = bdiag[i] - bdiag[i + 1];
449: bjtmp = bj + bdiag[i + 1] + 1;
450: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
452: /* load in initial (unfactored row) */
453: nz = ai[i + 1] - ai[i];
454: ajtmp = aj + ai[i];
455: v = aa + bs2 * ai[i];
456: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
458: /* elimination */
459: bjtmp = bj + bi[i];
460: nzL = bi[i + 1] - bi[i];
461: for (k = 0; k < nzL; k++) {
462: row = bjtmp[k];
463: pc = rtmp + bs2 * row;
464: for (flg = 0, j = 0; j < bs2; j++) {
465: if (pc[j] != 0.0) {
466: flg = 1;
467: break;
468: }
469: }
470: if (flg) {
471: pv = b->a + bs2 * bdiag[row];
472: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
473: PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
475: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
476: pv = b->a + bs2 * (bdiag[row + 1] + 1);
477: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
478: for (j = 0; j < nz; j++) {
479: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
480: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
481: v = rtmp + bs2 * pj[j];
482: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
483: pv += bs2;
484: }
485: PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
486: }
487: }
489: /* finished row so stick it into b->a */
490: /* L part */
491: pv = b->a + bs2 * bi[i];
492: pj = b->j + bi[i];
493: nz = bi[i + 1] - bi[i];
494: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
496: /* Mark diagonal and invert diagonal for simpler triangular solves */
497: pv = b->a + bs2 * bdiag[i];
498: pj = b->j + bdiag[i];
499: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
500: PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
501: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
503: /* U part */
504: pv = b->a + bs2 * (bdiag[i + 1] + 1);
505: pj = b->j + bdiag[i + 1] + 1;
506: nz = bdiag[i] - bdiag[i + 1] - 1;
507: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
508: }
509: PetscCall(PetscFree2(rtmp, mwork));
511: C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
512: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
513: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
514: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
515: C->assembled = PETSC_TRUE;
517: PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
518: PetscFunctionReturn(PETSC_SUCCESS);
519: }