Actual source code: baijfact11.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 4 by 4
9: */
10: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic;
15: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16: PetscInt *ajtmpold, *ajtmp, nz, row;
17: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
18: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
19: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
21: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
22: MatScalar m13, m14, m15, m16;
23: MatScalar *ba = b->a, *aa = a->a;
24: PetscBool pivotinblocks = b->pivotinblocks;
25: PetscReal shift = info->shiftamount;
26: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
28: PetscFunctionBegin;
29: PetscCall(ISGetIndices(isrow, &r));
30: PetscCall(ISGetIndices(isicol, &ic));
31: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
32: allowzeropivot = PetscNot(A->erroriffailure);
34: for (i = 0; i < n; i++) {
35: nz = bi[i + 1] - bi[i];
36: ajtmp = bj + bi[i];
37: for (j = 0; j < nz; j++) {
38: x = rtmp + 16 * ajtmp[j];
39: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
40: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
41: }
42: /* load in initial (unfactored row) */
43: idx = r[i];
44: nz = ai[idx + 1] - ai[idx];
45: ajtmpold = aj + ai[idx];
46: v = aa + 16 * ai[idx];
47: for (j = 0; j < nz; j++) {
48: x = rtmp + 16 * ic[ajtmpold[j]];
49: x[0] = v[0];
50: x[1] = v[1];
51: x[2] = v[2];
52: x[3] = v[3];
53: x[4] = v[4];
54: x[5] = v[5];
55: x[6] = v[6];
56: x[7] = v[7];
57: x[8] = v[8];
58: x[9] = v[9];
59: x[10] = v[10];
60: x[11] = v[11];
61: x[12] = v[12];
62: x[13] = v[13];
63: x[14] = v[14];
64: x[15] = v[15];
65: v += 16;
66: }
67: row = *ajtmp++;
68: while (row < i) {
69: pc = rtmp + 16 * row;
70: p1 = pc[0];
71: p2 = pc[1];
72: p3 = pc[2];
73: p4 = pc[3];
74: p5 = pc[4];
75: p6 = pc[5];
76: p7 = pc[6];
77: p8 = pc[7];
78: p9 = pc[8];
79: p10 = pc[9];
80: p11 = pc[10];
81: p12 = pc[11];
82: p13 = pc[12];
83: p14 = pc[13];
84: p15 = pc[14];
85: p16 = pc[15];
86: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
87: pv = ba + 16 * diag_offset[row];
88: pj = bj + diag_offset[row] + 1;
89: x1 = pv[0];
90: x2 = pv[1];
91: x3 = pv[2];
92: x4 = pv[3];
93: x5 = pv[4];
94: x6 = pv[5];
95: x7 = pv[6];
96: x8 = pv[7];
97: x9 = pv[8];
98: x10 = pv[9];
99: x11 = pv[10];
100: x12 = pv[11];
101: x13 = pv[12];
102: x14 = pv[13];
103: x15 = pv[14];
104: x16 = pv[15];
105: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
106: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
107: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
108: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
110: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
111: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
112: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
113: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
115: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
116: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
117: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
118: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
120: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
121: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
122: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
123: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
125: nz = bi[row + 1] - diag_offset[row] - 1;
126: pv += 16;
127: for (j = 0; j < nz; j++) {
128: x1 = pv[0];
129: x2 = pv[1];
130: x3 = pv[2];
131: x4 = pv[3];
132: x5 = pv[4];
133: x6 = pv[5];
134: x7 = pv[6];
135: x8 = pv[7];
136: x9 = pv[8];
137: x10 = pv[9];
138: x11 = pv[10];
139: x12 = pv[11];
140: x13 = pv[12];
141: x14 = pv[13];
142: x15 = pv[14];
143: x16 = pv[15];
144: x = rtmp + 16 * pj[j];
145: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
146: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
147: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
148: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
150: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
151: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
152: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
153: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
155: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
156: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
157: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
158: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
160: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
161: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
162: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
163: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
165: pv += 16;
166: }
167: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
168: }
169: row = *ajtmp++;
170: }
171: /* finished row so stick it into b->a */
172: pv = ba + 16 * bi[i];
173: pj = bj + bi[i];
174: nz = bi[i + 1] - bi[i];
175: for (j = 0; j < nz; j++) {
176: x = rtmp + 16 * pj[j];
177: pv[0] = x[0];
178: pv[1] = x[1];
179: pv[2] = x[2];
180: pv[3] = x[3];
181: pv[4] = x[4];
182: pv[5] = x[5];
183: pv[6] = x[6];
184: pv[7] = x[7];
185: pv[8] = x[8];
186: pv[9] = x[9];
187: pv[10] = x[10];
188: pv[11] = x[11];
189: pv[12] = x[12];
190: pv[13] = x[13];
191: pv[14] = x[14];
192: pv[15] = x[15];
193: pv += 16;
194: }
195: /* invert diagonal block */
196: w = ba + 16 * diag_offset[i];
197: if (pivotinblocks) {
198: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
199: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200: } else {
201: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
202: }
203: }
205: PetscCall(PetscFree(rtmp));
206: PetscCall(ISRestoreIndices(isicol, &ic));
207: PetscCall(ISRestoreIndices(isrow, &r));
209: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
210: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
211: C->assembled = PETSC_TRUE;
213: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
214: PetscFunctionReturn(PETSC_SUCCESS);
215: }
217: /* MatLUFactorNumeric_SeqBAIJ_4 -
218: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
219: PetscKernel_A_gets_A_times_B()
220: PetscKernel_A_gets_A_minus_B_times_C()
221: PetscKernel_A_gets_inverse_A()
222: */
224: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
225: {
226: Mat C = B;
227: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228: IS isrow = b->row, isicol = b->icol;
229: const PetscInt *r, *ic;
230: PetscInt i, j, k, nz, nzL, row;
231: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
232: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
233: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
234: PetscInt flg;
235: PetscReal shift;
236: PetscBool allowzeropivot, zeropivotdetected;
238: PetscFunctionBegin;
239: allowzeropivot = PetscNot(A->erroriffailure);
240: PetscCall(ISGetIndices(isrow, &r));
241: PetscCall(ISGetIndices(isicol, &ic));
243: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
244: shift = 0;
245: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246: shift = info->shiftamount;
247: }
249: /* generate work space needed by the factorization */
250: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
251: PetscCall(PetscArrayzero(rtmp, bs2 * n));
253: for (i = 0; i < n; i++) {
254: /* zero rtmp */
255: /* L part */
256: nz = bi[i + 1] - bi[i];
257: bjtmp = bj + bi[i];
258: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
260: /* U part */
261: nz = bdiag[i] - bdiag[i + 1];
262: bjtmp = bj + bdiag[i + 1] + 1;
263: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
265: /* load in initial (unfactored row) */
266: nz = ai[r[i] + 1] - ai[r[i]];
267: ajtmp = aj + ai[r[i]];
268: v = aa + bs2 * ai[r[i]];
269: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
271: /* elimination */
272: bjtmp = bj + bi[i];
273: nzL = bi[i + 1] - bi[i];
274: for (k = 0; k < nzL; k++) {
275: row = bjtmp[k];
276: pc = rtmp + bs2 * row;
277: for (flg = 0, j = 0; j < bs2; j++) {
278: if (pc[j] != 0.0) {
279: flg = 1;
280: break;
281: }
282: }
283: if (flg) {
284: pv = b->a + bs2 * bdiag[row];
285: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
288: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289: pv = b->a + bs2 * (bdiag[row + 1] + 1);
290: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
291: for (j = 0; j < nz; j++) {
292: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294: v = rtmp + bs2 * pj[j];
295: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
296: pv += bs2;
297: }
298: PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
299: }
300: }
302: /* finished row so stick it into b->a */
303: /* L part */
304: pv = b->a + bs2 * bi[i];
305: pj = b->j + bi[i];
306: nz = bi[i + 1] - bi[i];
307: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
309: /* Mark diagonal and invert diagonal for simpler triangular solves */
310: pv = b->a + bs2 * bdiag[i];
311: pj = b->j + bdiag[i];
312: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
313: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
314: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316: /* U part */
317: pv = b->a + bs2 * (bdiag[i + 1] + 1);
318: pj = b->j + bdiag[i + 1] + 1;
319: nz = bdiag[i] - bdiag[i + 1] - 1;
320: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
321: }
323: PetscCall(PetscFree2(rtmp, mwork));
324: PetscCall(ISRestoreIndices(isicol, &ic));
325: PetscCall(ISRestoreIndices(isrow, &r));
327: C->ops->solve = MatSolve_SeqBAIJ_4;
328: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329: C->assembled = PETSC_TRUE;
331: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
332: PetscFunctionReturn(PETSC_SUCCESS);
333: }
335: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336: {
337: /*
338: Default Version for when blocks are 4 by 4 Using natural ordering
339: */
340: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342: PetscInt *ajtmpold, *ajtmp, nz, row;
343: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348: MatScalar m13, m14, m15, m16;
349: MatScalar *ba = b->a, *aa = a->a;
350: PetscBool pivotinblocks = b->pivotinblocks;
351: PetscReal shift = info->shiftamount;
352: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
354: PetscFunctionBegin;
355: allowzeropivot = PetscNot(A->erroriffailure);
356: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
358: for (i = 0; i < n; i++) {
359: nz = bi[i + 1] - bi[i];
360: ajtmp = bj + bi[i];
361: for (j = 0; j < nz; j++) {
362: x = rtmp + 16 * ajtmp[j];
363: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
364: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
365: }
366: /* load in initial (unfactored row) */
367: nz = ai[i + 1] - ai[i];
368: ajtmpold = aj + ai[i];
369: v = aa + 16 * ai[i];
370: for (j = 0; j < nz; j++) {
371: x = rtmp + 16 * ajtmpold[j];
372: x[0] = v[0];
373: x[1] = v[1];
374: x[2] = v[2];
375: x[3] = v[3];
376: x[4] = v[4];
377: x[5] = v[5];
378: x[6] = v[6];
379: x[7] = v[7];
380: x[8] = v[8];
381: x[9] = v[9];
382: x[10] = v[10];
383: x[11] = v[11];
384: x[12] = v[12];
385: x[13] = v[13];
386: x[14] = v[14];
387: x[15] = v[15];
388: v += 16;
389: }
390: row = *ajtmp++;
391: while (row < i) {
392: pc = rtmp + 16 * row;
393: p1 = pc[0];
394: p2 = pc[1];
395: p3 = pc[2];
396: p4 = pc[3];
397: p5 = pc[4];
398: p6 = pc[5];
399: p7 = pc[6];
400: p8 = pc[7];
401: p9 = pc[8];
402: p10 = pc[9];
403: p11 = pc[10];
404: p12 = pc[11];
405: p13 = pc[12];
406: p14 = pc[13];
407: p15 = pc[14];
408: p16 = pc[15];
409: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
410: pv = ba + 16 * diag_offset[row];
411: pj = bj + diag_offset[row] + 1;
412: x1 = pv[0];
413: x2 = pv[1];
414: x3 = pv[2];
415: x4 = pv[3];
416: x5 = pv[4];
417: x6 = pv[5];
418: x7 = pv[6];
419: x8 = pv[7];
420: x9 = pv[8];
421: x10 = pv[9];
422: x11 = pv[10];
423: x12 = pv[11];
424: x13 = pv[12];
425: x14 = pv[13];
426: x15 = pv[14];
427: x16 = pv[15];
428: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
429: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
430: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
431: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
433: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
434: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
435: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
436: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
438: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
439: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
440: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
441: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
443: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
444: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
445: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
446: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
447: nz = bi[row + 1] - diag_offset[row] - 1;
448: pv += 16;
449: for (j = 0; j < nz; j++) {
450: x1 = pv[0];
451: x2 = pv[1];
452: x3 = pv[2];
453: x4 = pv[3];
454: x5 = pv[4];
455: x6 = pv[5];
456: x7 = pv[6];
457: x8 = pv[7];
458: x9 = pv[8];
459: x10 = pv[9];
460: x11 = pv[10];
461: x12 = pv[11];
462: x13 = pv[12];
463: x14 = pv[13];
464: x15 = pv[14];
465: x16 = pv[15];
466: x = rtmp + 16 * pj[j];
467: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
468: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
469: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
470: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
472: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
473: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
474: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
475: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
477: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
478: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
479: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
480: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
482: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
483: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
484: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
485: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
487: pv += 16;
488: }
489: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
490: }
491: row = *ajtmp++;
492: }
493: /* finished row so stick it into b->a */
494: pv = ba + 16 * bi[i];
495: pj = bj + bi[i];
496: nz = bi[i + 1] - bi[i];
497: for (j = 0; j < nz; j++) {
498: x = rtmp + 16 * pj[j];
499: pv[0] = x[0];
500: pv[1] = x[1];
501: pv[2] = x[2];
502: pv[3] = x[3];
503: pv[4] = x[4];
504: pv[5] = x[5];
505: pv[6] = x[6];
506: pv[7] = x[7];
507: pv[8] = x[8];
508: pv[9] = x[9];
509: pv[10] = x[10];
510: pv[11] = x[11];
511: pv[12] = x[12];
512: pv[13] = x[13];
513: pv[14] = x[14];
514: pv[15] = x[15];
515: pv += 16;
516: }
517: /* invert diagonal block */
518: w = ba + 16 * diag_offset[i];
519: if (pivotinblocks) {
520: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
521: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
522: } else {
523: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
524: }
525: }
527: PetscCall(PetscFree(rtmp));
529: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
530: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
531: C->assembled = PETSC_TRUE;
533: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
534: PetscFunctionReturn(PETSC_SUCCESS);
535: }
537: /*
538: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
539: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
540: */
541: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
542: {
543: Mat C = B;
544: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
545: PetscInt i, j, k, nz, nzL, row;
546: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
547: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
548: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
549: PetscInt flg;
550: PetscReal shift;
551: PetscBool allowzeropivot, zeropivotdetected;
553: PetscFunctionBegin;
554: allowzeropivot = PetscNot(A->erroriffailure);
556: /* generate work space needed by the factorization */
557: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
558: PetscCall(PetscArrayzero(rtmp, bs2 * n));
560: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
561: shift = 0;
562: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
563: shift = info->shiftamount;
564: }
566: for (i = 0; i < n; i++) {
567: /* zero rtmp */
568: /* L part */
569: nz = bi[i + 1] - bi[i];
570: bjtmp = bj + bi[i];
571: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
573: /* U part */
574: nz = bdiag[i] - bdiag[i + 1];
575: bjtmp = bj + bdiag[i + 1] + 1;
576: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
578: /* load in initial (unfactored row) */
579: nz = ai[i + 1] - ai[i];
580: ajtmp = aj + ai[i];
581: v = aa + bs2 * ai[i];
582: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
584: /* elimination */
585: bjtmp = bj + bi[i];
586: nzL = bi[i + 1] - bi[i];
587: for (k = 0; k < nzL; k++) {
588: row = bjtmp[k];
589: pc = rtmp + bs2 * row;
590: for (flg = 0, j = 0; j < bs2; j++) {
591: if (pc[j] != 0.0) {
592: flg = 1;
593: break;
594: }
595: }
596: if (flg) {
597: pv = b->a + bs2 * bdiag[row];
598: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
599: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
601: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
602: pv = b->a + bs2 * (bdiag[row + 1] + 1);
603: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
604: for (j = 0; j < nz; j++) {
605: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
606: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
607: v = rtmp + bs2 * pj[j];
608: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
609: pv += bs2;
610: }
611: PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
612: }
613: }
615: /* finished row so stick it into b->a */
616: /* L part */
617: pv = b->a + bs2 * bi[i];
618: pj = b->j + bi[i];
619: nz = bi[i + 1] - bi[i];
620: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
622: /* Mark diagonal and invert diagonal for simpler triangular solves */
623: pv = b->a + bs2 * bdiag[i];
624: pj = b->j + bdiag[i];
625: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
626: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
627: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
629: /* U part */
630: pv = b->a + bs2 * (bdiag[i + 1] + 1);
631: pj = b->j + bdiag[i + 1] + 1;
632: nz = bdiag[i] - bdiag[i + 1] - 1;
633: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
634: }
635: PetscCall(PetscFree2(rtmp, mwork));
637: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
638: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
639: C->assembled = PETSC_TRUE;
641: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
642: PetscFunctionReturn(PETSC_SUCCESS);
643: }