Actual source code: baijfact.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8: {
9: Mat C = B;
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11: IS isrow = b->row, isicol = b->icol;
12: const PetscInt *r, *ic;
13: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15: MatScalar *rtmp, *pc, *mwork, *pv;
16: MatScalar *aa = a->a, *v;
17: PetscReal shift = info->shiftamount;
18: const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
20: PetscFunctionBegin;
21: PetscCall(ISGetIndices(isrow, &r));
22: PetscCall(ISGetIndices(isicol, &ic));
24: /* generate work space needed by the factorization */
25: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26: PetscCall(PetscArrayzero(rtmp, bs2 * n));
28: for (PetscInt i = 0; i < n; i++) {
29: /* zero rtmp */
30: /* L part */
31: PetscInt *pj;
32: PetscInt nzL, nz = bi[i + 1] - bi[i];
33: bjtmp = bj + bi[i];
34: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
36: /* U part */
37: nz = bdiag[i] - bdiag[i + 1];
38: bjtmp = bj + bdiag[i + 1] + 1;
39: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
41: /* load in initial (unfactored row) */
42: nz = ai[r[i] + 1] - ai[r[i]];
43: ajtmp = aj + ai[r[i]];
44: v = aa + bs2 * ai[r[i]];
45: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
47: /* elimination */
48: bjtmp = bj + bi[i];
49: nzL = bi[i + 1] - bi[i];
50: for (PetscInt k = 0; k < nzL; k++) {
51: PetscBool flg = PETSC_FALSE;
52: PetscInt row = bjtmp[k];
54: pc = rtmp + bs2 * row;
55: for (PetscInt j = 0; j < bs2; j++) {
56: if (pc[j] != (PetscScalar)0.0) {
57: flg = PETSC_TRUE;
58: break;
59: }
60: }
61: if (flg) {
62: pv = b->a + bs2 * bdiag[row];
63: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67: pv = b->a + bs2 * (bdiag[row + 1] + 1);
68: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69: for (PetscInt j = 0; j < nz; j++) {
70: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72: v = rtmp + 4 * pj[j];
73: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74: pv += 4;
75: }
76: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77: }
78: }
80: /* finished row so stick it into b->a */
81: /* L part */
82: pv = b->a + bs2 * bi[i];
83: pj = b->j + bi[i];
84: nz = bi[i + 1] - bi[i];
85: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87: /* Mark diagonal and invert diagonal for simpler triangular solves */
88: pv = b->a + bs2 * bdiag[i];
89: pj = b->j + bdiag[i];
90: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91: {
92: PetscBool zeropivotdetected;
94: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96: }
98: /* U part */
99: pv = b->a + bs2 * (bdiag[i + 1] + 1);
100: pj = b->j + bdiag[i + 1] + 1;
101: nz = bdiag[i] - bdiag[i + 1] - 1;
102: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103: }
105: PetscCall(PetscFree2(rtmp, mwork));
106: PetscCall(ISRestoreIndices(isicol, &ic));
107: PetscCall(ISRestoreIndices(isrow, &r));
109: C->ops->solve = MatSolve_SeqBAIJ_2;
110: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111: C->assembled = PETSC_TRUE;
113: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118: {
119: Mat C = B;
120: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121: PetscInt i, j, k, nz, nzL, row, *pj;
122: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124: MatScalar *rtmp, *pc, *mwork, *pv;
125: MatScalar *aa = a->a, *v;
126: PetscInt flg;
127: PetscReal shift = info->shiftamount;
128: PetscBool allowzeropivot, zeropivotdetected;
130: PetscFunctionBegin;
131: allowzeropivot = PetscNot(A->erroriffailure);
133: /* generate work space needed by the factorization */
134: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135: PetscCall(PetscArrayzero(rtmp, bs2 * n));
137: for (i = 0; i < n; i++) {
138: /* zero rtmp */
139: /* L part */
140: nz = bi[i + 1] - bi[i];
141: bjtmp = bj + bi[i];
142: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
144: /* U part */
145: nz = bdiag[i] - bdiag[i + 1];
146: bjtmp = bj + bdiag[i + 1] + 1;
147: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
149: /* load in initial (unfactored row) */
150: nz = ai[i + 1] - ai[i];
151: ajtmp = aj + ai[i];
152: v = aa + bs2 * ai[i];
153: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
155: /* elimination */
156: bjtmp = bj + bi[i];
157: nzL = bi[i + 1] - bi[i];
158: for (k = 0; k < nzL; k++) {
159: row = bjtmp[k];
160: pc = rtmp + bs2 * row;
161: for (flg = 0, j = 0; j < bs2; j++) {
162: if (pc[j] != (PetscScalar)0.0) {
163: flg = 1;
164: break;
165: }
166: }
167: if (flg) {
168: pv = b->a + bs2 * bdiag[row];
169: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
172: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173: pv = b->a + bs2 * (bdiag[row + 1] + 1);
174: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175: for (j = 0; j < nz; j++) {
176: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178: v = rtmp + 4 * pj[j];
179: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180: pv += 4;
181: }
182: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183: }
184: }
186: /* finished row so stick it into b->a */
187: /* L part */
188: pv = b->a + bs2 * bi[i];
189: pj = b->j + bi[i];
190: nz = bi[i + 1] - bi[i];
191: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
193: /* Mark diagonal and invert diagonal for simpler triangular solves */
194: pv = b->a + bs2 * bdiag[i];
195: pj = b->j + bdiag[i];
196: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200: /* U part */
201: /*
202: pv = b->a + bs2*bi[2*n-i];
203: pj = b->j + bi[2*n-i];
204: nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205: */
206: pv = b->a + bs2 * (bdiag[i + 1] + 1);
207: pj = b->j + bdiag[i + 1] + 1;
208: nz = bdiag[i] - bdiag[i + 1] - 1;
209: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210: }
211: PetscCall(PetscFree2(rtmp, mwork));
213: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
214: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217: C->assembled = PETSC_TRUE;
219: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
223: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224: {
225: Mat C = B;
226: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227: IS isrow = b->row, isicol = b->icol;
228: const PetscInt *r, *ic;
229: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230: PetscInt *ajtmpold, *ajtmp, nz, row;
231: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
232: MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
233: MatScalar p1, p2, p3, p4;
234: MatScalar *ba = b->a, *aa = a->a;
235: PetscReal shift = info->shiftamount;
236: PetscBool allowzeropivot, zeropivotdetected;
238: PetscFunctionBegin;
239: allowzeropivot = PetscNot(A->erroriffailure);
240: PetscCall(ISGetIndices(isrow, &r));
241: PetscCall(ISGetIndices(isicol, &ic));
242: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
244: for (i = 0; i < n; i++) {
245: nz = bi[i + 1] - bi[i];
246: ajtmp = bj + bi[i];
247: for (j = 0; j < nz; j++) {
248: x = rtmp + 4 * ajtmp[j];
249: x[0] = x[1] = x[2] = x[3] = 0.0;
250: }
251: /* load in initial (unfactored row) */
252: idx = r[i];
253: nz = ai[idx + 1] - ai[idx];
254: ajtmpold = aj + ai[idx];
255: v = aa + 4 * ai[idx];
256: for (j = 0; j < nz; j++) {
257: x = rtmp + 4 * ic[ajtmpold[j]];
258: x[0] = v[0];
259: x[1] = v[1];
260: x[2] = v[2];
261: x[3] = v[3];
262: v += 4;
263: }
264: row = *ajtmp++;
265: while (row < i) {
266: pc = rtmp + 4 * row;
267: p1 = pc[0];
268: p2 = pc[1];
269: p3 = pc[2];
270: p4 = pc[3];
271: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
272: pv = ba + 4 * diag_offset[row];
273: pj = bj + diag_offset[row] + 1;
274: x1 = pv[0];
275: x2 = pv[1];
276: x3 = pv[2];
277: x4 = pv[3];
278: pc[0] = m1 = p1 * x1 + p3 * x2;
279: pc[1] = m2 = p2 * x1 + p4 * x2;
280: pc[2] = m3 = p1 * x3 + p3 * x4;
281: pc[3] = m4 = p2 * x3 + p4 * x4;
282: nz = bi[row + 1] - diag_offset[row] - 1;
283: pv += 4;
284: for (j = 0; j < nz; j++) {
285: x1 = pv[0];
286: x2 = pv[1];
287: x3 = pv[2];
288: x4 = pv[3];
289: x = rtmp + 4 * pj[j];
290: x[0] -= m1 * x1 + m3 * x2;
291: x[1] -= m2 * x1 + m4 * x2;
292: x[2] -= m1 * x3 + m3 * x4;
293: x[3] -= m2 * x3 + m4 * x4;
294: pv += 4;
295: }
296: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
297: }
298: row = *ajtmp++;
299: }
300: /* finished row so stick it into b->a */
301: pv = ba + 4 * bi[i];
302: pj = bj + bi[i];
303: nz = bi[i + 1] - bi[i];
304: for (j = 0; j < nz; j++) {
305: x = rtmp + 4 * pj[j];
306: pv[0] = x[0];
307: pv[1] = x[1];
308: pv[2] = x[2];
309: pv[3] = x[3];
310: pv += 4;
311: }
312: /* invert diagonal block */
313: w = ba + 4 * diag_offset[i];
314: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
315: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316: }
318: PetscCall(PetscFree(rtmp));
319: PetscCall(ISRestoreIndices(isicol, &ic));
320: PetscCall(ISRestoreIndices(isrow, &r));
322: C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
323: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
324: C->assembled = PETSC_TRUE;
326: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
327: PetscFunctionReturn(PETSC_SUCCESS);
328: }
329: /*
330: Version for when blocks are 2 by 2 Using natural ordering
331: */
332: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
333: {
334: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
335: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
336: PetscInt *ajtmpold, *ajtmp, nz, row;
337: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
338: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
339: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
340: MatScalar *ba = b->a, *aa = a->a;
341: PetscReal shift = info->shiftamount;
342: PetscBool allowzeropivot, zeropivotdetected;
344: PetscFunctionBegin;
345: allowzeropivot = PetscNot(A->erroriffailure);
346: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
347: for (i = 0; i < n; i++) {
348: nz = bi[i + 1] - bi[i];
349: ajtmp = bj + bi[i];
350: for (j = 0; j < nz; j++) {
351: x = rtmp + 4 * ajtmp[j];
352: x[0] = x[1] = x[2] = x[3] = 0.0;
353: }
354: /* load in initial (unfactored row) */
355: nz = ai[i + 1] - ai[i];
356: ajtmpold = aj + ai[i];
357: v = aa + 4 * ai[i];
358: for (j = 0; j < nz; j++) {
359: x = rtmp + 4 * ajtmpold[j];
360: x[0] = v[0];
361: x[1] = v[1];
362: x[2] = v[2];
363: x[3] = v[3];
364: v += 4;
365: }
366: row = *ajtmp++;
367: while (row < i) {
368: pc = rtmp + 4 * row;
369: p1 = pc[0];
370: p2 = pc[1];
371: p3 = pc[2];
372: p4 = pc[3];
373: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
374: pv = ba + 4 * diag_offset[row];
375: pj = bj + diag_offset[row] + 1;
376: x1 = pv[0];
377: x2 = pv[1];
378: x3 = pv[2];
379: x4 = pv[3];
380: pc[0] = m1 = p1 * x1 + p3 * x2;
381: pc[1] = m2 = p2 * x1 + p4 * x2;
382: pc[2] = m3 = p1 * x3 + p3 * x4;
383: pc[3] = m4 = p2 * x3 + p4 * x4;
384: nz = bi[row + 1] - diag_offset[row] - 1;
385: pv += 4;
386: for (j = 0; j < nz; j++) {
387: x1 = pv[0];
388: x2 = pv[1];
389: x3 = pv[2];
390: x4 = pv[3];
391: x = rtmp + 4 * pj[j];
392: x[0] -= m1 * x1 + m3 * x2;
393: x[1] -= m2 * x1 + m4 * x2;
394: x[2] -= m1 * x3 + m3 * x4;
395: x[3] -= m2 * x3 + m4 * x4;
396: pv += 4;
397: }
398: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
399: }
400: row = *ajtmp++;
401: }
402: /* finished row so stick it into b->a */
403: pv = ba + 4 * bi[i];
404: pj = bj + bi[i];
405: nz = bi[i + 1] - bi[i];
406: for (j = 0; j < nz; j++) {
407: x = rtmp + 4 * pj[j];
408: pv[0] = x[0];
409: pv[1] = x[1];
410: pv[2] = x[2];
411: pv[3] = x[3];
412: /*
413: printf(" col %d:",pj[j]);
414: PetscInt j1;
415: for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
416: printf("\n");
417: */
418: pv += 4;
419: }
420: /* invert diagonal block */
421: w = ba + 4 * diag_offset[i];
422: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
423: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
424: }
426: PetscCall(PetscFree(rtmp));
428: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
429: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
430: C->assembled = PETSC_TRUE;
432: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
433: PetscFunctionReturn(PETSC_SUCCESS);
434: }
436: /*
437: Version for when blocks are 1 by 1.
438: */
439: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
440: {
441: Mat C = B;
442: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
443: IS isrow = b->row, isicol = b->icol;
444: const PetscInt *r, *ic, *ics;
445: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
446: PetscInt i, j, k, nz, nzL, row, *pj;
447: const PetscInt *ajtmp, *bjtmp;
448: MatScalar *rtmp, *pc, multiplier, *pv;
449: const MatScalar *aa = a->a, *v;
450: PetscBool row_identity, col_identity;
451: FactorShiftCtx sctx;
452: const PetscInt *ddiag;
453: PetscReal rs;
454: MatScalar d;
456: PetscFunctionBegin;
457: /* MatPivotSetUp(): initialize shift context sctx */
458: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
460: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
461: ddiag = a->diag;
462: sctx.shift_top = info->zeropivot;
463: for (i = 0; i < n; i++) {
464: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
465: d = (aa)[ddiag[i]];
466: rs = -PetscAbsScalar(d) - PetscRealPart(d);
467: v = aa + ai[i];
468: nz = ai[i + 1] - ai[i];
469: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
470: if (rs > sctx.shift_top) sctx.shift_top = rs;
471: }
472: sctx.shift_top *= 1.1;
473: sctx.nshift_max = 5;
474: sctx.shift_lo = 0.;
475: sctx.shift_hi = 1.;
476: }
478: PetscCall(ISGetIndices(isrow, &r));
479: PetscCall(ISGetIndices(isicol, &ic));
480: PetscCall(PetscMalloc1(n + 1, &rtmp));
481: ics = ic;
483: do {
484: sctx.newshift = PETSC_FALSE;
485: for (i = 0; i < n; i++) {
486: /* zero rtmp */
487: /* L part */
488: nz = bi[i + 1] - bi[i];
489: bjtmp = bj + bi[i];
490: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
492: /* U part */
493: nz = bdiag[i] - bdiag[i + 1];
494: bjtmp = bj + bdiag[i + 1] + 1;
495: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
497: /* load in initial (unfactored row) */
498: nz = ai[r[i] + 1] - ai[r[i]];
499: ajtmp = aj + ai[r[i]];
500: v = aa + ai[r[i]];
501: for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
503: /* ZeropivotApply() */
504: rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
506: /* elimination */
507: bjtmp = bj + bi[i];
508: row = *bjtmp++;
509: nzL = bi[i + 1] - bi[i];
510: for (k = 0; k < nzL; k++) {
511: pc = rtmp + row;
512: if (*pc != (PetscScalar)0.0) {
513: pv = b->a + bdiag[row];
514: multiplier = *pc * (*pv);
515: *pc = multiplier;
517: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
518: pv = b->a + bdiag[row + 1] + 1;
519: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
520: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
521: PetscCall(PetscLogFlops(2.0 * nz));
522: }
523: row = *bjtmp++;
524: }
526: /* finished row so stick it into b->a */
527: rs = 0.0;
528: /* L part */
529: pv = b->a + bi[i];
530: pj = b->j + bi[i];
531: nz = bi[i + 1] - bi[i];
532: for (j = 0; j < nz; j++) {
533: pv[j] = rtmp[pj[j]];
534: rs += PetscAbsScalar(pv[j]);
535: }
537: /* U part */
538: pv = b->a + bdiag[i + 1] + 1;
539: pj = b->j + bdiag[i + 1] + 1;
540: nz = bdiag[i] - bdiag[i + 1] - 1;
541: for (j = 0; j < nz; j++) {
542: pv[j] = rtmp[pj[j]];
543: rs += PetscAbsScalar(pv[j]);
544: }
546: sctx.rs = rs;
547: sctx.pv = rtmp[i];
548: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
549: if (sctx.newshift) break; /* break for-loop */
550: rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
552: /* Mark diagonal and invert diagonal for simpler triangular solves */
553: pv = b->a + bdiag[i];
554: *pv = (PetscScalar)1.0 / rtmp[i];
556: } /* endof for (i=0; i<n; i++) { */
558: /* MatPivotRefine() */
559: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
560: /*
561: * if no shift in this attempt & shifting & started shifting & can refine,
562: * then try lower shift
563: */
564: sctx.shift_hi = sctx.shift_fraction;
565: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
566: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
567: sctx.newshift = PETSC_TRUE;
568: sctx.nshift++;
569: }
570: } while (sctx.newshift);
572: PetscCall(PetscFree(rtmp));
573: PetscCall(ISRestoreIndices(isicol, &ic));
574: PetscCall(ISRestoreIndices(isrow, &r));
576: PetscCall(ISIdentity(isrow, &row_identity));
577: PetscCall(ISIdentity(isicol, &col_identity));
578: if (row_identity && col_identity) {
579: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
580: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
581: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
582: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
583: } else {
584: C->ops->solve = MatSolve_SeqBAIJ_1;
585: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
586: }
587: C->assembled = PETSC_TRUE;
588: PetscCall(PetscLogFlops(C->cmap->n));
590: /* MatShiftView(A,info,&sctx) */
591: if (sctx.nshift) {
592: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
593: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
594: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
595: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
596: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
597: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
598: }
599: }
600: PetscFunctionReturn(PETSC_SUCCESS);
601: }
603: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
604: {
605: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
606: IS isrow = b->row, isicol = b->icol;
607: const PetscInt *r, *ic;
608: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
609: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
610: PetscInt *diag_offset = b->diag, diag, *pj;
611: MatScalar *pv, *v, *rtmp, multiplier, *pc;
612: MatScalar *ba = b->a, *aa = a->a;
613: PetscBool row_identity, col_identity;
615: PetscFunctionBegin;
616: PetscCall(ISGetIndices(isrow, &r));
617: PetscCall(ISGetIndices(isicol, &ic));
618: PetscCall(PetscMalloc1(n + 1, &rtmp));
620: for (i = 0; i < n; i++) {
621: nz = bi[i + 1] - bi[i];
622: ajtmp = bj + bi[i];
623: for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
625: /* load in initial (unfactored row) */
626: nz = ai[r[i] + 1] - ai[r[i]];
627: ajtmpold = aj + ai[r[i]];
628: v = aa + ai[r[i]];
629: for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
631: row = *ajtmp++;
632: while (row < i) {
633: pc = rtmp + row;
634: if (*pc != 0.0) {
635: pv = ba + diag_offset[row];
636: pj = bj + diag_offset[row] + 1;
637: multiplier = *pc * *pv++;
638: *pc = multiplier;
639: nz = bi[row + 1] - diag_offset[row] - 1;
640: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
641: PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
642: }
643: row = *ajtmp++;
644: }
645: /* finished row so stick it into b->a */
646: pv = ba + bi[i];
647: pj = bj + bi[i];
648: nz = bi[i + 1] - bi[i];
649: for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
650: diag = diag_offset[i] - bi[i];
651: /* check pivot entry for current row */
652: PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
653: pv[diag] = 1.0 / pv[diag];
654: }
656: PetscCall(PetscFree(rtmp));
657: PetscCall(ISRestoreIndices(isicol, &ic));
658: PetscCall(ISRestoreIndices(isrow, &r));
659: PetscCall(ISIdentity(isrow, &row_identity));
660: PetscCall(ISIdentity(isicol, &col_identity));
661: if (row_identity && col_identity) {
662: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
663: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
664: } else {
665: C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
666: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
667: }
668: C->assembled = PETSC_TRUE;
669: PetscCall(PetscLogFlops(C->cmap->n));
670: PetscFunctionReturn(PETSC_SUCCESS);
671: }
673: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
674: {
675: PetscFunctionBegin;
676: *type = MATSOLVERPETSC;
677: PetscFunctionReturn(PETSC_SUCCESS);
678: }
680: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
681: {
682: PetscInt n = A->rmap->n;
684: PetscFunctionBegin;
685: #if defined(PETSC_USE_COMPLEX)
686: PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
687: #endif
688: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
689: PetscCall(MatSetSizes(*B, n, n, n, n));
690: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
691: PetscCall(MatSetType(*B, MATSEQBAIJ));
693: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
694: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
695: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
696: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
697: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
698: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
699: PetscCall(MatSetType(*B, MATSEQSBAIJ));
700: PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
702: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
703: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
704: /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
705: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
706: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
707: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
708: (*B)->factortype = ftype;
709: (*B)->canuseordering = PETSC_TRUE;
711: PetscCall(PetscFree((*B)->solvertype));
712: PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
713: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
718: {
719: Mat C;
721: PetscFunctionBegin;
722: PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
723: PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
724: PetscCall(MatLUFactorNumeric(C, A, info));
726: A->ops->solve = C->ops->solve;
727: A->ops->solvetranspose = C->ops->solvetranspose;
729: PetscCall(MatHeaderMerge(A, &C));
730: PetscFunctionReturn(PETSC_SUCCESS);
731: }
733: #include <../src/mat/impls/sbaij/seq/sbaij.h>
734: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
735: {
736: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
737: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
738: IS ip = b->row;
739: const PetscInt *rip;
740: PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
741: PetscInt *ai = a->i, *aj = a->j;
742: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
743: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
744: PetscReal rs;
745: FactorShiftCtx sctx;
747: PetscFunctionBegin;
748: if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
749: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
750: PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
751: PetscCall(MatDestroy(&a->sbaijMat));
752: PetscFunctionReturn(PETSC_SUCCESS);
753: }
755: /* MatPivotSetUp(): initialize shift context sctx */
756: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
758: PetscCall(ISGetIndices(ip, &rip));
759: PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
761: sctx.shift_amount = 0.;
762: sctx.nshift = 0;
763: do {
764: sctx.newshift = PETSC_FALSE;
765: for (i = 0; i < mbs; i++) {
766: rtmp[i] = 0.0;
767: jl[i] = mbs;
768: il[0] = 0;
769: }
771: for (k = 0; k < mbs; k++) {
772: bval = ba + bi[k];
773: /* initialize k-th row by the perm[k]-th row of A */
774: jmin = ai[rip[k]];
775: jmax = ai[rip[k] + 1];
776: for (j = jmin; j < jmax; j++) {
777: col = rip[aj[j]];
778: if (col >= k) { /* only take upper triangular entry */
779: rtmp[col] = aa[j];
780: *bval++ = 0.0; /* for in-place factorization */
781: }
782: }
784: /* shift the diagonal of the matrix */
785: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
787: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
788: dk = rtmp[k];
789: i = jl[k]; /* first row to be added to k_th row */
791: while (i < k) {
792: nexti = jl[i]; /* next row to be added to k_th row */
794: /* compute multiplier, update diag(k) and U(i,k) */
795: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
796: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
797: dk += uikdi * ba[ili];
798: ba[ili] = uikdi; /* -U(i,k) */
800: /* add multiple of row i to k-th row */
801: jmin = ili + 1;
802: jmax = bi[i + 1];
803: if (jmin < jmax) {
804: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
805: /* update il and jl for row i */
806: il[i] = jmin;
807: j = bj[jmin];
808: jl[i] = jl[j];
809: jl[j] = i;
810: }
811: i = nexti;
812: }
814: /* shift the diagonals when zero pivot is detected */
815: /* compute rs=sum of abs(off-diagonal) */
816: rs = 0.0;
817: jmin = bi[k] + 1;
818: nz = bi[k + 1] - jmin;
819: if (nz) {
820: bcol = bj + jmin;
821: while (nz--) {
822: rs += PetscAbsScalar(rtmp[*bcol]);
823: bcol++;
824: }
825: }
827: sctx.rs = rs;
828: sctx.pv = dk;
829: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
830: if (sctx.newshift) break;
831: dk = sctx.pv;
833: /* copy data into U(k,:) */
834: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
835: jmin = bi[k] + 1;
836: jmax = bi[k + 1];
837: if (jmin < jmax) {
838: for (j = jmin; j < jmax; j++) {
839: col = bj[j];
840: ba[j] = rtmp[col];
841: rtmp[col] = 0.0;
842: }
843: /* add the k-th row into il and jl */
844: il[k] = jmin;
845: i = bj[jmin];
846: jl[k] = jl[i];
847: jl[i] = k;
848: }
849: }
850: } while (sctx.newshift);
851: PetscCall(PetscFree3(rtmp, il, jl));
853: PetscCall(ISRestoreIndices(ip, &rip));
855: C->assembled = PETSC_TRUE;
856: C->preallocated = PETSC_TRUE;
858: PetscCall(PetscLogFlops(C->rmap->N));
859: if (sctx.nshift) {
860: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
861: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
862: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
863: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
864: }
865: }
866: PetscFunctionReturn(PETSC_SUCCESS);
867: }
869: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
870: {
871: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
872: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
873: PetscInt i, j, am = a->mbs;
874: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
875: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
876: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
877: PetscReal rs;
878: FactorShiftCtx sctx;
880: PetscFunctionBegin;
881: /* MatPivotSetUp(): initialize shift context sctx */
882: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
884: PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
886: do {
887: sctx.newshift = PETSC_FALSE;
888: for (i = 0; i < am; i++) {
889: rtmp[i] = 0.0;
890: jl[i] = am;
891: il[0] = 0;
892: }
894: for (k = 0; k < am; k++) {
895: /* initialize k-th row with elements nonzero in row perm(k) of A */
896: nz = ai[k + 1] - ai[k];
897: acol = aj + ai[k];
898: aval = aa + ai[k];
899: bval = ba + bi[k];
900: while (nz--) {
901: if (*acol < k) { /* skip lower triangular entries */
902: acol++;
903: aval++;
904: } else {
905: rtmp[*acol++] = *aval++;
906: *bval++ = 0.0; /* for in-place factorization */
907: }
908: }
910: /* shift the diagonal of the matrix */
911: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
913: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
914: dk = rtmp[k];
915: i = jl[k]; /* first row to be added to k_th row */
917: while (i < k) {
918: nexti = jl[i]; /* next row to be added to k_th row */
919: /* compute multiplier, update D(k) and U(i,k) */
920: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
921: uikdi = -ba[ili] * ba[bi[i]];
922: dk += uikdi * ba[ili];
923: ba[ili] = uikdi; /* -U(i,k) */
925: /* add multiple of row i to k-th row ... */
926: jmin = ili + 1;
927: nz = bi[i + 1] - jmin;
928: if (nz > 0) {
929: bcol = bj + jmin;
930: bval = ba + jmin;
931: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
932: /* update il and jl for i-th row */
933: il[i] = jmin;
934: j = bj[jmin];
935: jl[i] = jl[j];
936: jl[j] = i;
937: }
938: i = nexti;
939: }
941: /* shift the diagonals when zero pivot is detected */
942: /* compute rs=sum of abs(off-diagonal) */
943: rs = 0.0;
944: jmin = bi[k] + 1;
945: nz = bi[k + 1] - jmin;
946: if (nz) {
947: bcol = bj + jmin;
948: while (nz--) {
949: rs += PetscAbsScalar(rtmp[*bcol]);
950: bcol++;
951: }
952: }
954: sctx.rs = rs;
955: sctx.pv = dk;
956: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
957: if (sctx.newshift) break; /* sctx.shift_amount is updated */
958: dk = sctx.pv;
960: /* copy data into U(k,:) */
961: ba[bi[k]] = 1.0 / dk;
962: jmin = bi[k] + 1;
963: nz = bi[k + 1] - jmin;
964: if (nz) {
965: bcol = bj + jmin;
966: bval = ba + jmin;
967: while (nz--) {
968: *bval++ = rtmp[*bcol];
969: rtmp[*bcol++] = 0.0;
970: }
971: /* add k-th row into il and jl */
972: il[k] = jmin;
973: i = bj[jmin];
974: jl[k] = jl[i];
975: jl[i] = k;
976: }
977: }
978: } while (sctx.newshift);
979: PetscCall(PetscFree3(rtmp, il, jl));
981: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
982: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983: C->assembled = PETSC_TRUE;
984: C->preallocated = PETSC_TRUE;
986: PetscCall(PetscLogFlops(C->rmap->N));
987: if (sctx.nshift) {
988: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
989: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
990: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
991: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
992: }
993: }
994: PetscFunctionReturn(PETSC_SUCCESS);
995: }
997: #include <petscbt.h>
998: #include <../src/mat/utils/freespace.h>
999: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1000: {
1001: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1002: Mat_SeqSBAIJ *b;
1003: Mat B;
1004: PetscBool perm_identity, missing;
1005: PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1006: const PetscInt *rip;
1007: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1008: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1009: PetscReal fill = info->fill, levels = info->levels;
1010: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1011: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1012: PetscBT lnkbt;
1014: PetscFunctionBegin;
1015: PetscCall(MatMissingDiagonal(A, &missing, &i));
1016: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1018: if (bs > 1) {
1019: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1020: (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1022: PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1023: PetscFunctionReturn(PETSC_SUCCESS);
1024: }
1026: PetscCall(ISIdentity(perm, &perm_identity));
1027: PetscCall(ISGetIndices(perm, &rip));
1029: /* special case that simply copies fill pattern */
1030: if (!levels && perm_identity) {
1031: PetscCall(PetscMalloc1(am + 1, &ui));
1032: for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1033: B = fact;
1034: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1036: b = (Mat_SeqSBAIJ *)B->data;
1037: uj = b->j;
1038: for (i = 0; i < am; i++) {
1039: aj = a->j + a->diag[i];
1040: for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1041: b->ilen[i] = ui[i];
1042: }
1043: PetscCall(PetscFree(ui));
1045: B->factortype = MAT_FACTOR_NONE;
1047: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1048: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1049: B->factortype = MAT_FACTOR_ICC;
1051: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1052: PetscFunctionReturn(PETSC_SUCCESS);
1053: }
1055: /* initialization */
1056: PetscCall(PetscMalloc1(am + 1, &ui));
1057: ui[0] = 0;
1058: PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1060: /* jl: linked list for storing indices of the pivot rows
1061: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1062: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1063: for (i = 0; i < am; i++) {
1064: jl[i] = am;
1065: il[i] = 0;
1066: }
1068: /* create and initialize a linked list for storing column indices of the active row k */
1069: nlnk = am + 1;
1070: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1072: /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1073: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1075: current_space = free_space;
1077: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1078: current_space_lvl = free_space_lvl;
1080: for (k = 0; k < am; k++) { /* for each active row k */
1081: /* initialize lnk by the column indices of row rip[k] of A */
1082: nzk = 0;
1083: ncols = ai[rip[k] + 1] - ai[rip[k]];
1084: ncols_upper = 0;
1085: cols = cols_lvl + am;
1086: for (j = 0; j < ncols; j++) {
1087: i = rip[*(aj + ai[rip[k]] + j)];
1088: if (i >= k) { /* only take upper triangular entry */
1089: cols[ncols_upper] = i;
1090: cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1091: ncols_upper++;
1092: }
1093: }
1094: PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1095: nzk += nlnk;
1097: /* update lnk by computing fill-in for each pivot row to be merged in */
1098: prow = jl[k]; /* 1st pivot row */
1100: while (prow < k) {
1101: nextprow = jl[prow];
1103: /* merge prow into k-th row */
1104: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1105: jmax = ui[prow + 1];
1106: ncols = jmax - jmin;
1107: i = jmin - ui[prow];
1108: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1109: for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1110: PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1111: nzk += nlnk;
1113: /* update il and jl for prow */
1114: if (jmin < jmax) {
1115: il[prow] = jmin;
1117: j = *cols;
1118: jl[prow] = jl[j];
1119: jl[j] = prow;
1120: }
1121: prow = nextprow;
1122: }
1124: /* if free space is not available, make more free space */
1125: if (current_space->local_remaining < nzk) {
1126: i = am - k + 1; /* num of unfactored rows */
1127: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1128: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1129: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
1130: reallocs++;
1131: }
1133: /* copy data into free_space and free_space_lvl, then initialize lnk */
1134: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1136: /* add the k-th row into il and jl */
1137: if (nzk - 1 > 0) {
1138: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1139: jl[k] = jl[i];
1140: jl[i] = k;
1141: il[k] = ui[k] + 1;
1142: }
1143: uj_ptr[k] = current_space->array;
1144: uj_lvl_ptr[k] = current_space_lvl->array;
1146: current_space->array += nzk;
1147: current_space->local_used += nzk;
1148: current_space->local_remaining -= nzk;
1150: current_space_lvl->array += nzk;
1151: current_space_lvl->local_used += nzk;
1152: current_space_lvl->local_remaining -= nzk;
1154: ui[k + 1] = ui[k] + nzk;
1155: }
1157: PetscCall(ISRestoreIndices(perm, &rip));
1158: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1159: PetscCall(PetscFree(cols_lvl));
1161: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1162: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1163: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1164: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1165: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1167: /* put together the new matrix in MATSEQSBAIJ format */
1168: B = fact;
1169: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1171: b = (Mat_SeqSBAIJ *)B->data;
1172: b->singlemalloc = PETSC_FALSE;
1173: b->free_a = PETSC_TRUE;
1174: b->free_ij = PETSC_TRUE;
1176: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1178: b->j = uj;
1179: b->i = ui;
1180: b->diag = NULL;
1181: b->ilen = NULL;
1182: b->imax = NULL;
1183: b->row = perm;
1184: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1186: PetscCall(PetscObjectReference((PetscObject)perm));
1188: b->icol = perm;
1190: PetscCall(PetscObjectReference((PetscObject)perm));
1191: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1193: b->maxnz = b->nz = ui[am];
1195: B->info.factor_mallocs = reallocs;
1196: B->info.fill_ratio_given = fill;
1197: if (ai[am] != 0.) {
1198: /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1199: B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1200: } else {
1201: B->info.fill_ratio_needed = 0.0;
1202: }
1203: #if defined(PETSC_USE_INFO)
1204: if (ai[am] != 0) {
1205: PetscReal af = B->info.fill_ratio_needed;
1206: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1207: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1208: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1209: } else {
1210: PetscCall(PetscInfo(A, "Empty matrix\n"));
1211: }
1212: #endif
1213: if (perm_identity) {
1214: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1215: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1217: } else {
1218: (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1219: }
1220: PetscFunctionReturn(PETSC_SUCCESS);
1221: }
1223: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1224: {
1225: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1226: Mat_SeqSBAIJ *b;
1227: Mat B;
1228: PetscBool perm_identity, missing;
1229: PetscReal fill = info->fill;
1230: const PetscInt *rip;
1231: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1232: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1233: PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1234: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1235: PetscBT lnkbt;
1237: PetscFunctionBegin;
1238: if (bs > 1) { /* convert to seqsbaij */
1239: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1240: (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1242: PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1243: PetscFunctionReturn(PETSC_SUCCESS);
1244: }
1246: PetscCall(MatMissingDiagonal(A, &missing, &i));
1247: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1249: /* check whether perm is the identity mapping */
1250: PetscCall(ISIdentity(perm, &perm_identity));
1251: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1252: PetscCall(ISGetIndices(perm, &rip));
1254: /* initialization */
1255: PetscCall(PetscMalloc1(mbs + 1, &ui));
1256: ui[0] = 0;
1258: /* jl: linked list for storing indices of the pivot rows
1259: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1260: PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1261: for (i = 0; i < mbs; i++) {
1262: jl[i] = mbs;
1263: il[i] = 0;
1264: }
1266: /* create and initialize a linked list for storing column indices of the active row k */
1267: nlnk = mbs + 1;
1268: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1270: /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1271: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1273: current_space = free_space;
1275: for (k = 0; k < mbs; k++) { /* for each active row k */
1276: /* initialize lnk by the column indices of row rip[k] of A */
1277: nzk = 0;
1278: ncols = ai[rip[k] + 1] - ai[rip[k]];
1279: ncols_upper = 0;
1280: for (j = 0; j < ncols; j++) {
1281: i = rip[*(aj + ai[rip[k]] + j)];
1282: if (i >= k) { /* only take upper triangular entry */
1283: cols[ncols_upper] = i;
1284: ncols_upper++;
1285: }
1286: }
1287: PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1288: nzk += nlnk;
1290: /* update lnk by computing fill-in for each pivot row to be merged in */
1291: prow = jl[k]; /* 1st pivot row */
1293: while (prow < k) {
1294: nextprow = jl[prow];
1295: /* merge prow into k-th row */
1296: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1297: jmax = ui[prow + 1];
1298: ncols = jmax - jmin;
1299: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1300: PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1301: nzk += nlnk;
1303: /* update il and jl for prow */
1304: if (jmin < jmax) {
1305: il[prow] = jmin;
1306: j = *uj_ptr;
1307: jl[prow] = jl[j];
1308: jl[j] = prow;
1309: }
1310: prow = nextprow;
1311: }
1313: /* if free space is not available, make more free space */
1314: if (current_space->local_remaining < nzk) {
1315: i = mbs - k + 1; /* num of unfactored rows */
1316: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1317: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1318: reallocs++;
1319: }
1321: /* copy data into free space, then initialize lnk */
1322: PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1324: /* add the k-th row into il and jl */
1325: if (nzk - 1 > 0) {
1326: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1327: jl[k] = jl[i];
1328: jl[i] = k;
1329: il[k] = ui[k] + 1;
1330: }
1331: ui_ptr[k] = current_space->array;
1332: current_space->array += nzk;
1333: current_space->local_used += nzk;
1334: current_space->local_remaining -= nzk;
1336: ui[k + 1] = ui[k] + nzk;
1337: }
1339: PetscCall(ISRestoreIndices(perm, &rip));
1340: PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1342: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1343: PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1344: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1345: PetscCall(PetscLLDestroy(lnk, lnkbt));
1347: /* put together the new matrix in MATSEQSBAIJ format */
1348: B = fact;
1349: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1351: b = (Mat_SeqSBAIJ *)B->data;
1352: b->singlemalloc = PETSC_FALSE;
1353: b->free_a = PETSC_TRUE;
1354: b->free_ij = PETSC_TRUE;
1356: PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1358: b->j = uj;
1359: b->i = ui;
1360: b->diag = NULL;
1361: b->ilen = NULL;
1362: b->imax = NULL;
1363: b->row = perm;
1364: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1366: PetscCall(PetscObjectReference((PetscObject)perm));
1367: b->icol = perm;
1368: PetscCall(PetscObjectReference((PetscObject)perm));
1369: PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1370: b->maxnz = b->nz = ui[mbs];
1372: B->info.factor_mallocs = reallocs;
1373: B->info.fill_ratio_given = fill;
1374: if (ai[mbs] != 0.) {
1375: /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1376: B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1377: } else {
1378: B->info.fill_ratio_needed = 0.0;
1379: }
1380: #if defined(PETSC_USE_INFO)
1381: if (ai[mbs] != 0.) {
1382: PetscReal af = B->info.fill_ratio_needed;
1383: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1384: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1385: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1386: } else {
1387: PetscCall(PetscInfo(A, "Empty matrix\n"));
1388: }
1389: #endif
1390: if (perm_identity) {
1391: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1392: } else {
1393: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1394: }
1395: PetscFunctionReturn(PETSC_SUCCESS);
1396: }
1398: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1399: {
1400: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1401: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1402: PetscInt i, k, n = a->mbs;
1403: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1404: const MatScalar *aa = a->a, *v;
1405: PetscScalar *x, *s, *t, *ls;
1406: const PetscScalar *b;
1408: PetscFunctionBegin;
1409: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1410: PetscCall(VecGetArrayRead(bb, &b));
1411: PetscCall(VecGetArray(xx, &x));
1412: t = a->solve_work;
1414: /* forward solve the lower triangular */
1415: PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1417: for (i = 1; i < n; i++) {
1418: v = aa + bs2 * ai[i];
1419: vi = aj + ai[i];
1420: nz = ai[i + 1] - ai[i];
1421: s = t + bs * i;
1422: PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1423: for (k = 0; k < nz; k++) {
1424: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1425: v += bs2;
1426: }
1427: }
1429: /* backward solve the upper triangular */
1430: ls = a->solve_work + A->cmap->n;
1431: for (i = n - 1; i >= 0; i--) {
1432: v = aa + bs2 * (adiag[i + 1] + 1);
1433: vi = aj + adiag[i + 1] + 1;
1434: nz = adiag[i] - adiag[i + 1] - 1;
1435: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1436: for (k = 0; k < nz; k++) {
1437: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1438: v += bs2;
1439: }
1440: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1441: PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1442: }
1444: PetscCall(VecRestoreArrayRead(bb, &b));
1445: PetscCall(VecRestoreArray(xx, &x));
1446: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1447: PetscFunctionReturn(PETSC_SUCCESS);
1448: }
1450: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1451: {
1452: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1453: IS iscol = a->col, isrow = a->row;
1454: const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1455: PetscInt i, m, n = a->mbs;
1456: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1457: const MatScalar *aa = a->a, *v;
1458: PetscScalar *x, *s, *t, *ls;
1459: const PetscScalar *b;
1461: PetscFunctionBegin;
1462: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1463: PetscCall(VecGetArrayRead(bb, &b));
1464: PetscCall(VecGetArray(xx, &x));
1465: t = a->solve_work;
1467: PetscCall(ISGetIndices(isrow, &rout));
1468: r = rout;
1469: PetscCall(ISGetIndices(iscol, &cout));
1470: c = cout;
1472: /* forward solve the lower triangular */
1473: PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1474: for (i = 1; i < n; i++) {
1475: v = aa + bs2 * ai[i];
1476: vi = aj + ai[i];
1477: nz = ai[i + 1] - ai[i];
1478: s = t + bs * i;
1479: PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1480: for (m = 0; m < nz; m++) {
1481: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1482: v += bs2;
1483: }
1484: }
1486: /* backward solve the upper triangular */
1487: ls = a->solve_work + A->cmap->n;
1488: for (i = n - 1; i >= 0; i--) {
1489: v = aa + bs2 * (adiag[i + 1] + 1);
1490: vi = aj + adiag[i + 1] + 1;
1491: nz = adiag[i] - adiag[i + 1] - 1;
1492: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1493: for (m = 0; m < nz; m++) {
1494: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1495: v += bs2;
1496: }
1497: PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1498: PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1499: }
1500: PetscCall(ISRestoreIndices(isrow, &rout));
1501: PetscCall(ISRestoreIndices(iscol, &cout));
1502: PetscCall(VecRestoreArrayRead(bb, &b));
1503: PetscCall(VecRestoreArray(xx, &x));
1504: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1505: PetscFunctionReturn(PETSC_SUCCESS);
1506: }
1508: /*
1509: For each block in an block array saves the largest absolute value in the block into another array
1510: */
1511: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1512: {
1513: PetscInt i, j;
1515: PetscFunctionBegin;
1516: PetscCall(PetscArrayzero(absarray, nbs + 1));
1517: for (i = 0; i < nbs; i++) {
1518: for (j = 0; j < bs2; j++) {
1519: if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1520: }
1521: }
1522: PetscFunctionReturn(PETSC_SUCCESS);
1523: }
1525: /*
1526: This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1527: */
1528: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1529: {
1530: Mat B = *fact;
1531: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
1532: IS isicol;
1533: const PetscInt *r, *ic;
1534: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1535: PetscInt *bi, *bj, *bdiag;
1537: PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1538: PetscInt nlnk, *lnk;
1539: PetscBT lnkbt;
1540: PetscBool row_identity, icol_identity;
1541: MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1542: PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1544: PetscReal dt = info->dt; /* shift=info->shiftamount; */
1545: PetscInt nnz_max;
1546: PetscBool missing;
1547: PetscReal *vtmp_abs;
1548: MatScalar *v_work;
1549: PetscInt *v_pivots;
1550: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1552: PetscFunctionBegin;
1553: /* symbolic factorization, can be reused */
1554: PetscCall(MatMissingDiagonal(A, &missing, &i));
1555: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1556: adiag = a->diag;
1558: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1560: /* bdiag is location of diagonal in factor */
1561: PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1563: /* allocate row pointers bi */
1564: PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1566: /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1567: dtcount = (PetscInt)info->dtcount;
1568: if (dtcount > mbs - 1) dtcount = mbs - 1;
1569: nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1570: /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1571: PetscCall(PetscMalloc1(nnz_max, &bj));
1572: nnz_max = nnz_max * bs2;
1573: PetscCall(PetscMalloc1(nnz_max, &ba));
1575: /* put together the new matrix */
1576: PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1578: b = (Mat_SeqBAIJ *)(B)->data;
1579: b->free_a = PETSC_TRUE;
1580: b->free_ij = PETSC_TRUE;
1581: b->singlemalloc = PETSC_FALSE;
1583: b->a = ba;
1584: b->j = bj;
1585: b->i = bi;
1586: b->diag = bdiag;
1587: b->ilen = NULL;
1588: b->imax = NULL;
1589: b->row = isrow;
1590: b->col = iscol;
1592: PetscCall(PetscObjectReference((PetscObject)isrow));
1593: PetscCall(PetscObjectReference((PetscObject)iscol));
1595: b->icol = isicol;
1596: PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1597: b->maxnz = nnz_max / bs2;
1599: (B)->factortype = MAT_FACTOR_ILUDT;
1600: (B)->info.factor_mallocs = 0;
1601: (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1602: /* end of symbolic factorization */
1603: PetscCall(ISGetIndices(isrow, &r));
1604: PetscCall(ISGetIndices(isicol, &ic));
1606: /* linked list for storing column indices of the active row */
1607: nlnk = mbs + 1;
1608: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1610: /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1611: PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1612: /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1613: PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1614: PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1615: PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1617: allowzeropivot = PetscNot(A->erroriffailure);
1618: bi[0] = 0;
1619: bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1620: bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */
1621: for (i = 0; i < mbs; i++) {
1622: /* copy initial fill into linked list */
1623: nzi = ai[r[i] + 1] - ai[r[i]];
1624: PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1625: nzi_al = adiag[r[i]] - ai[r[i]];
1626: nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1628: /* load in initial unfactored row */
1629: ajtmp = aj + ai[r[i]];
1630: PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1631: PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1632: aatmp = a->a + bs2 * ai[r[i]];
1633: for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1635: /* add pivot rows into linked list */
1636: row = lnk[mbs];
1637: while (row < i) {
1638: nzi_bl = bi[row + 1] - bi[row] + 1;
1639: bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1640: PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1641: nzi += nlnk;
1642: row = lnk[row];
1643: }
1645: /* copy data from lnk into jtmp, then initialize lnk */
1646: PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1648: /* numerical factorization */
1649: bjtmp = jtmp;
1650: row = *bjtmp++; /* 1st pivot row */
1652: while (row < i) {
1653: pc = rtmp + bs2 * row;
1654: pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */
1655: PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1656: PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1657: if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */
1658: pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1659: pv = ba + bs2 * (bdiag[row + 1] + 1);
1660: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1661: for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1662: /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1663: }
1664: row = *bjtmp++;
1665: }
1667: /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1668: nzi_bl = 0;
1669: j = 0;
1670: while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1671: PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1672: nzi_bl++;
1673: j++;
1674: }
1675: nzi_bu = nzi - nzi_bl - 1;
1677: while (j < nzi) { /* U-part */
1678: PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1679: j++;
1680: }
1682: PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1684: bjtmp = bj + bi[i];
1685: batmp = ba + bs2 * bi[i];
1686: /* apply level dropping rule to L part */
1687: ncut = nzi_al + dtcount;
1688: if (ncut < nzi_bl) {
1689: PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1690: PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1691: } else {
1692: ncut = nzi_bl;
1693: }
1694: for (j = 0; j < ncut; j++) {
1695: bjtmp[j] = jtmp[j];
1696: PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1697: }
1698: bi[i + 1] = bi[i] + ncut;
1699: nzi = ncut + 1;
1701: /* apply level dropping rule to U part */
1702: ncut = nzi_au + dtcount;
1703: if (ncut < nzi_bu) {
1704: PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1705: PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1706: } else {
1707: ncut = nzi_bu;
1708: }
1709: nzi += ncut;
1711: /* mark bdiagonal */
1712: bdiag[i + 1] = bdiag[i] - (ncut + 1);
1713: bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1715: bjtmp = bj + bdiag[i];
1716: batmp = ba + bs2 * bdiag[i];
1717: PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1718: *bjtmp = i;
1720: bjtmp = bj + bdiag[i + 1] + 1;
1721: batmp = ba + (bdiag[i + 1] + 1) * bs2;
1723: for (k = 0; k < ncut; k++) {
1724: bjtmp[k] = jtmp[nzi_bl + 1 + k];
1725: PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1726: }
1728: im[i] = nzi; /* used by PetscLLAddSortedLU() */
1730: /* invert diagonal block for simpler triangular solves - add shift??? */
1731: batmp = ba + bs2 * bdiag[i];
1733: PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1734: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1735: } /* for (i=0; i<mbs; i++) */
1736: PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1738: /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1739: PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1741: PetscCall(ISRestoreIndices(isrow, &r));
1742: PetscCall(ISRestoreIndices(isicol, &ic));
1744: PetscCall(PetscLLDestroy(lnk, lnkbt));
1746: PetscCall(PetscFree2(im, jtmp));
1747: PetscCall(PetscFree2(rtmp, vtmp));
1749: PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1750: b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1752: PetscCall(ISIdentity(isrow, &row_identity));
1753: PetscCall(ISIdentity(isicol, &icol_identity));
1754: if (row_identity && icol_identity) {
1755: B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1756: } else {
1757: B->ops->solve = MatSolve_SeqBAIJ_N;
1758: }
1760: B->ops->solveadd = NULL;
1761: B->ops->solvetranspose = NULL;
1762: B->ops->solvetransposeadd = NULL;
1763: B->ops->matsolve = NULL;
1764: B->assembled = PETSC_TRUE;
1765: B->preallocated = PETSC_TRUE;
1766: PetscFunctionReturn(PETSC_SUCCESS);
1767: }