Actual source code: baijfact.c

  1: /*
  2:     Factorization code for BAIJ format.
  3: */
  4: #include <../src/mat/impls/baij/seq/baij.h>
  5: #include <petsc/private/kernels/blockinvert.h>

  7: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
  8: {
  9:   Mat             C = B;
 10:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 11:   IS              isrow = b->row, isicol = b->icol;
 12:   const PetscInt *r, *ic;
 13:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
 14:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
 15:   MatScalar      *rtmp, *pc, *mwork, *pv;
 16:   MatScalar      *aa             = a->a, *v;
 17:   PetscReal       shift          = info->shiftamount;
 18:   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);

 20:   PetscFunctionBegin;
 21:   PetscCall(ISGetIndices(isrow, &r));
 22:   PetscCall(ISGetIndices(isicol, &ic));

 24:   /* generate work space needed by the factorization */
 25:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
 26:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

 28:   for (PetscInt i = 0; i < n; i++) {
 29:     /* zero rtmp */
 30:     /* L part */
 31:     PetscInt *pj;
 32:     PetscInt  nzL, nz = bi[i + 1] - bi[i];
 33:     bjtmp = bj + bi[i];
 34:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 36:     /* U part */
 37:     nz    = bdiag[i] - bdiag[i + 1];
 38:     bjtmp = bj + bdiag[i + 1] + 1;
 39:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 41:     /* load in initial (unfactored row) */
 42:     nz    = ai[r[i] + 1] - ai[r[i]];
 43:     ajtmp = aj + ai[r[i]];
 44:     v     = aa + bs2 * ai[r[i]];
 45:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

 47:     /* elimination */
 48:     bjtmp = bj + bi[i];
 49:     nzL   = bi[i + 1] - bi[i];
 50:     for (PetscInt k = 0; k < nzL; k++) {
 51:       PetscBool flg = PETSC_FALSE;
 52:       PetscInt  row = bjtmp[k];

 54:       pc = rtmp + bs2 * row;
 55:       for (PetscInt j = 0; j < bs2; j++) {
 56:         if (pc[j] != (PetscScalar)0.0) {
 57:           flg = PETSC_TRUE;
 58:           break;
 59:         }
 60:       }
 61:       if (flg) {
 62:         pv = b->a + bs2 * bdiag[row];
 63:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
 64:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

 66:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
 67:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 68:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
 69:         for (PetscInt j = 0; j < nz; j++) {
 70:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
 71:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
 72:           v = rtmp + 4 * pj[j];
 73:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
 74:           pv += 4;
 75:         }
 76:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
 77:       }
 78:     }

 80:     /* finished row so stick it into b->a */
 81:     /* L part */
 82:     pv = b->a + bs2 * bi[i];
 83:     pj = b->j + bi[i];
 84:     nz = bi[i + 1] - bi[i];
 85:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

 87:     /* Mark diagonal and invert diagonal for simpler triangular solves */
 88:     pv = b->a + bs2 * bdiag[i];
 89:     pj = b->j + bdiag[i];
 90:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
 91:     {
 92:       PetscBool zeropivotdetected;

 94:       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
 95:       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
 96:     }

 98:     /* U part */
 99:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100:     pj = b->j + bdiag[i + 1] + 1;
101:     nz = bdiag[i] - bdiag[i + 1] - 1;
102:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103:   }

105:   PetscCall(PetscFree2(rtmp, mwork));
106:   PetscCall(ISRestoreIndices(isicol, &ic));
107:   PetscCall(ISRestoreIndices(isrow, &r));

109:   C->ops->solve          = MatSolve_SeqBAIJ_2;
110:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111:   C->assembled           = PETSC_TRUE;

113:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114:   PetscFunctionReturn(PETSC_SUCCESS);
115: }

117: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118: {
119:   Mat             C = B;
120:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121:   PetscInt        i, j, k, nz, nzL, row, *pj;
122:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124:   MatScalar      *rtmp, *pc, *mwork, *pv;
125:   MatScalar      *aa = a->a, *v;
126:   PetscInt        flg;
127:   PetscReal       shift = info->shiftamount;
128:   PetscBool       allowzeropivot, zeropivotdetected;

130:   PetscFunctionBegin;
131:   allowzeropivot = PetscNot(A->erroriffailure);

133:   /* generate work space needed by the factorization */
134:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

137:   for (i = 0; i < n; i++) {
138:     /* zero rtmp */
139:     /* L part */
140:     nz    = bi[i + 1] - bi[i];
141:     bjtmp = bj + bi[i];
142:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

144:     /* U part */
145:     nz    = bdiag[i] - bdiag[i + 1];
146:     bjtmp = bj + bdiag[i + 1] + 1;
147:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

149:     /* load in initial (unfactored row) */
150:     nz    = ai[i + 1] - ai[i];
151:     ajtmp = aj + ai[i];
152:     v     = aa + bs2 * ai[i];
153:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

155:     /* elimination */
156:     bjtmp = bj + bi[i];
157:     nzL   = bi[i + 1] - bi[i];
158:     for (k = 0; k < nzL; k++) {
159:       row = bjtmp[k];
160:       pc  = rtmp + bs2 * row;
161:       for (flg = 0, j = 0; j < bs2; j++) {
162:         if (pc[j] != (PetscScalar)0.0) {
163:           flg = 1;
164:           break;
165:         }
166:       }
167:       if (flg) {
168:         pv = b->a + bs2 * bdiag[row];
169:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

172:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175:         for (j = 0; j < nz; j++) {
176:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178:           v = rtmp + 4 * pj[j];
179:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180:           pv += 4;
181:         }
182:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183:       }
184:     }

186:     /* finished row so stick it into b->a */
187:     /* L part */
188:     pv = b->a + bs2 * bi[i];
189:     pj = b->j + bi[i];
190:     nz = bi[i + 1] - bi[i];
191:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

193:     /* Mark diagonal and invert diagonal for simpler triangular solves */
194:     pv = b->a + bs2 * bdiag[i];
195:     pj = b->j + bdiag[i];
196:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197:     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

200:     /* U part */
201:     /*
202:     pv = b->a + bs2*bi[2*n-i];
203:     pj = b->j + bi[2*n-i];
204:     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205:     */
206:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207:     pj = b->j + bdiag[i + 1] + 1;
208:     nz = bdiag[i] - bdiag[i + 1] - 1;
209:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210:   }
211:   PetscCall(PetscFree2(rtmp, mwork));

213:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217:   C->assembled           = PETSC_TRUE;

219:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220:   PetscFunctionReturn(PETSC_SUCCESS);
221: }

223: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224: {
225:   Mat             C = B;
226:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227:   IS              isrow = b->row, isicol = b->icol;
228:   const PetscInt *r, *ic;
229:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230:   PetscInt       *ajtmpold, *ajtmp, nz, row;
231:   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
232:   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
233:   MatScalar       p1, p2, p3, p4;
234:   MatScalar      *ba = b->a, *aa = a->a;
235:   PetscReal       shift = info->shiftamount;
236:   PetscBool       allowzeropivot, zeropivotdetected;

238:   PetscFunctionBegin;
239:   allowzeropivot = PetscNot(A->erroriffailure);
240:   PetscCall(ISGetIndices(isrow, &r));
241:   PetscCall(ISGetIndices(isicol, &ic));
242:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));

244:   for (i = 0; i < n; i++) {
245:     nz    = bi[i + 1] - bi[i];
246:     ajtmp = bj + bi[i];
247:     for (j = 0; j < nz; j++) {
248:       x    = rtmp + 4 * ajtmp[j];
249:       x[0] = x[1] = x[2] = x[3] = 0.0;
250:     }
251:     /* load in initial (unfactored row) */
252:     idx      = r[i];
253:     nz       = ai[idx + 1] - ai[idx];
254:     ajtmpold = aj + ai[idx];
255:     v        = aa + 4 * ai[idx];
256:     for (j = 0; j < nz; j++) {
257:       x    = rtmp + 4 * ic[ajtmpold[j]];
258:       x[0] = v[0];
259:       x[1] = v[1];
260:       x[2] = v[2];
261:       x[3] = v[3];
262:       v += 4;
263:     }
264:     row = *ajtmp++;
265:     while (row < i) {
266:       pc = rtmp + 4 * row;
267:       p1 = pc[0];
268:       p2 = pc[1];
269:       p3 = pc[2];
270:       p4 = pc[3];
271:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
272:         pv    = ba + 4 * diag_offset[row];
273:         pj    = bj + diag_offset[row] + 1;
274:         x1    = pv[0];
275:         x2    = pv[1];
276:         x3    = pv[2];
277:         x4    = pv[3];
278:         pc[0] = m1 = p1 * x1 + p3 * x2;
279:         pc[1] = m2 = p2 * x1 + p4 * x2;
280:         pc[2] = m3 = p1 * x3 + p3 * x4;
281:         pc[3] = m4 = p2 * x3 + p4 * x4;
282:         nz         = bi[row + 1] - diag_offset[row] - 1;
283:         pv += 4;
284:         for (j = 0; j < nz; j++) {
285:           x1 = pv[0];
286:           x2 = pv[1];
287:           x3 = pv[2];
288:           x4 = pv[3];
289:           x  = rtmp + 4 * pj[j];
290:           x[0] -= m1 * x1 + m3 * x2;
291:           x[1] -= m2 * x1 + m4 * x2;
292:           x[2] -= m1 * x3 + m3 * x4;
293:           x[3] -= m2 * x3 + m4 * x4;
294:           pv += 4;
295:         }
296:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
297:       }
298:       row = *ajtmp++;
299:     }
300:     /* finished row so stick it into b->a */
301:     pv = ba + 4 * bi[i];
302:     pj = bj + bi[i];
303:     nz = bi[i + 1] - bi[i];
304:     for (j = 0; j < nz; j++) {
305:       x     = rtmp + 4 * pj[j];
306:       pv[0] = x[0];
307:       pv[1] = x[1];
308:       pv[2] = x[2];
309:       pv[3] = x[3];
310:       pv += 4;
311:     }
312:     /* invert diagonal block */
313:     w = ba + 4 * diag_offset[i];
314:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
315:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316:   }

318:   PetscCall(PetscFree(rtmp));
319:   PetscCall(ISRestoreIndices(isicol, &ic));
320:   PetscCall(ISRestoreIndices(isrow, &r));

322:   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
323:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
324:   C->assembled           = PETSC_TRUE;

326:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
327:   PetscFunctionReturn(PETSC_SUCCESS);
328: }
329: /*
330:       Version for when blocks are 2 by 2 Using natural ordering
331: */
332: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
333: {
334:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
335:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
336:   PetscInt    *ajtmpold, *ajtmp, nz, row;
337:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
338:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
339:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
340:   MatScalar   *ba = b->a, *aa = a->a;
341:   PetscReal    shift = info->shiftamount;
342:   PetscBool    allowzeropivot, zeropivotdetected;

344:   PetscFunctionBegin;
345:   allowzeropivot = PetscNot(A->erroriffailure);
346:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
347:   for (i = 0; i < n; i++) {
348:     nz    = bi[i + 1] - bi[i];
349:     ajtmp = bj + bi[i];
350:     for (j = 0; j < nz; j++) {
351:       x    = rtmp + 4 * ajtmp[j];
352:       x[0] = x[1] = x[2] = x[3] = 0.0;
353:     }
354:     /* load in initial (unfactored row) */
355:     nz       = ai[i + 1] - ai[i];
356:     ajtmpold = aj + ai[i];
357:     v        = aa + 4 * ai[i];
358:     for (j = 0; j < nz; j++) {
359:       x    = rtmp + 4 * ajtmpold[j];
360:       x[0] = v[0];
361:       x[1] = v[1];
362:       x[2] = v[2];
363:       x[3] = v[3];
364:       v += 4;
365:     }
366:     row = *ajtmp++;
367:     while (row < i) {
368:       pc = rtmp + 4 * row;
369:       p1 = pc[0];
370:       p2 = pc[1];
371:       p3 = pc[2];
372:       p4 = pc[3];
373:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
374:         pv    = ba + 4 * diag_offset[row];
375:         pj    = bj + diag_offset[row] + 1;
376:         x1    = pv[0];
377:         x2    = pv[1];
378:         x3    = pv[2];
379:         x4    = pv[3];
380:         pc[0] = m1 = p1 * x1 + p3 * x2;
381:         pc[1] = m2 = p2 * x1 + p4 * x2;
382:         pc[2] = m3 = p1 * x3 + p3 * x4;
383:         pc[3] = m4 = p2 * x3 + p4 * x4;
384:         nz         = bi[row + 1] - diag_offset[row] - 1;
385:         pv += 4;
386:         for (j = 0; j < nz; j++) {
387:           x1 = pv[0];
388:           x2 = pv[1];
389:           x3 = pv[2];
390:           x4 = pv[3];
391:           x  = rtmp + 4 * pj[j];
392:           x[0] -= m1 * x1 + m3 * x2;
393:           x[1] -= m2 * x1 + m4 * x2;
394:           x[2] -= m1 * x3 + m3 * x4;
395:           x[3] -= m2 * x3 + m4 * x4;
396:           pv += 4;
397:         }
398:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
399:       }
400:       row = *ajtmp++;
401:     }
402:     /* finished row so stick it into b->a */
403:     pv = ba + 4 * bi[i];
404:     pj = bj + bi[i];
405:     nz = bi[i + 1] - bi[i];
406:     for (j = 0; j < nz; j++) {
407:       x     = rtmp + 4 * pj[j];
408:       pv[0] = x[0];
409:       pv[1] = x[1];
410:       pv[2] = x[2];
411:       pv[3] = x[3];
412:       /*
413:       printf(" col %d:",pj[j]);
414:       PetscInt j1;
415:       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
416:       printf("\n");
417:       */
418:       pv += 4;
419:     }
420:     /* invert diagonal block */
421:     w = ba + 4 * diag_offset[i];
422:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
423:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
424:   }

426:   PetscCall(PetscFree(rtmp));

428:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
429:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
430:   C->assembled           = PETSC_TRUE;

432:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
433:   PetscFunctionReturn(PETSC_SUCCESS);
434: }

436: /*
437:      Version for when blocks are 1 by 1.
438: */
439: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
440: {
441:   Mat              C = B;
442:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
443:   IS               isrow = b->row, isicol = b->icol;
444:   const PetscInt  *r, *ic, *ics;
445:   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
446:   PetscInt         i, j, k, nz, nzL, row, *pj;
447:   const PetscInt  *ajtmp, *bjtmp;
448:   MatScalar       *rtmp, *pc, multiplier, *pv;
449:   const MatScalar *aa = a->a, *v;
450:   PetscBool        row_identity, col_identity;
451:   FactorShiftCtx   sctx;
452:   const PetscInt  *ddiag;
453:   PetscReal        rs;
454:   MatScalar        d;

456:   PetscFunctionBegin;
457:   /* MatPivotSetUp(): initialize shift context sctx */
458:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

460:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
461:     ddiag          = a->diag;
462:     sctx.shift_top = info->zeropivot;
463:     for (i = 0; i < n; i++) {
464:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
465:       d  = (aa)[ddiag[i]];
466:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
467:       v  = aa + ai[i];
468:       nz = ai[i + 1] - ai[i];
469:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
470:       if (rs > sctx.shift_top) sctx.shift_top = rs;
471:     }
472:     sctx.shift_top *= 1.1;
473:     sctx.nshift_max = 5;
474:     sctx.shift_lo   = 0.;
475:     sctx.shift_hi   = 1.;
476:   }

478:   PetscCall(ISGetIndices(isrow, &r));
479:   PetscCall(ISGetIndices(isicol, &ic));
480:   PetscCall(PetscMalloc1(n + 1, &rtmp));
481:   ics = ic;

483:   do {
484:     sctx.newshift = PETSC_FALSE;
485:     for (i = 0; i < n; i++) {
486:       /* zero rtmp */
487:       /* L part */
488:       nz    = bi[i + 1] - bi[i];
489:       bjtmp = bj + bi[i];
490:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

492:       /* U part */
493:       nz    = bdiag[i] - bdiag[i + 1];
494:       bjtmp = bj + bdiag[i + 1] + 1;
495:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

497:       /* load in initial (unfactored row) */
498:       nz    = ai[r[i] + 1] - ai[r[i]];
499:       ajtmp = aj + ai[r[i]];
500:       v     = aa + ai[r[i]];
501:       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];

503:       /* ZeropivotApply() */
504:       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

506:       /* elimination */
507:       bjtmp = bj + bi[i];
508:       row   = *bjtmp++;
509:       nzL   = bi[i + 1] - bi[i];
510:       for (k = 0; k < nzL; k++) {
511:         pc = rtmp + row;
512:         if (*pc != (PetscScalar)0.0) {
513:           pv         = b->a + bdiag[row];
514:           multiplier = *pc * (*pv);
515:           *pc        = multiplier;

517:           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
518:           pv = b->a + bdiag[row + 1] + 1;
519:           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
520:           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
521:           PetscCall(PetscLogFlops(2.0 * nz));
522:         }
523:         row = *bjtmp++;
524:       }

526:       /* finished row so stick it into b->a */
527:       rs = 0.0;
528:       /* L part */
529:       pv = b->a + bi[i];
530:       pj = b->j + bi[i];
531:       nz = bi[i + 1] - bi[i];
532:       for (j = 0; j < nz; j++) {
533:         pv[j] = rtmp[pj[j]];
534:         rs += PetscAbsScalar(pv[j]);
535:       }

537:       /* U part */
538:       pv = b->a + bdiag[i + 1] + 1;
539:       pj = b->j + bdiag[i + 1] + 1;
540:       nz = bdiag[i] - bdiag[i + 1] - 1;
541:       for (j = 0; j < nz; j++) {
542:         pv[j] = rtmp[pj[j]];
543:         rs += PetscAbsScalar(pv[j]);
544:       }

546:       sctx.rs = rs;
547:       sctx.pv = rtmp[i];
548:       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
549:       if (sctx.newshift) break; /* break for-loop */
550:       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

552:       /* Mark diagonal and invert diagonal for simpler triangular solves */
553:       pv  = b->a + bdiag[i];
554:       *pv = (PetscScalar)1.0 / rtmp[i];

556:     } /* endof for (i=0; i<n; i++) { */

558:     /* MatPivotRefine() */
559:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
560:       /*
561:        * if no shift in this attempt & shifting & started shifting & can refine,
562:        * then try lower shift
563:        */
564:       sctx.shift_hi       = sctx.shift_fraction;
565:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
566:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
567:       sctx.newshift       = PETSC_TRUE;
568:       sctx.nshift++;
569:     }
570:   } while (sctx.newshift);

572:   PetscCall(PetscFree(rtmp));
573:   PetscCall(ISRestoreIndices(isicol, &ic));
574:   PetscCall(ISRestoreIndices(isrow, &r));

576:   PetscCall(ISIdentity(isrow, &row_identity));
577:   PetscCall(ISIdentity(isicol, &col_identity));
578:   if (row_identity && col_identity) {
579:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
580:     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
581:     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
582:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
583:   } else {
584:     C->ops->solve          = MatSolve_SeqBAIJ_1;
585:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
586:   }
587:   C->assembled = PETSC_TRUE;
588:   PetscCall(PetscLogFlops(C->cmap->n));

590:   /* MatShiftView(A,info,&sctx) */
591:   if (sctx.nshift) {
592:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
593:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
594:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
595:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
596:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
597:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
598:     }
599:   }
600:   PetscFunctionReturn(PETSC_SUCCESS);
601: }

603: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
604: {
605:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
606:   IS              isrow = b->row, isicol = b->icol;
607:   const PetscInt *r, *ic;
608:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
609:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
610:   PetscInt       *diag_offset = b->diag, diag, *pj;
611:   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
612:   MatScalar      *ba = b->a, *aa = a->a;
613:   PetscBool       row_identity, col_identity;

615:   PetscFunctionBegin;
616:   PetscCall(ISGetIndices(isrow, &r));
617:   PetscCall(ISGetIndices(isicol, &ic));
618:   PetscCall(PetscMalloc1(n + 1, &rtmp));

620:   for (i = 0; i < n; i++) {
621:     nz    = bi[i + 1] - bi[i];
622:     ajtmp = bj + bi[i];
623:     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;

625:     /* load in initial (unfactored row) */
626:     nz       = ai[r[i] + 1] - ai[r[i]];
627:     ajtmpold = aj + ai[r[i]];
628:     v        = aa + ai[r[i]];
629:     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];

631:     row = *ajtmp++;
632:     while (row < i) {
633:       pc = rtmp + row;
634:       if (*pc != 0.0) {
635:         pv         = ba + diag_offset[row];
636:         pj         = bj + diag_offset[row] + 1;
637:         multiplier = *pc * *pv++;
638:         *pc        = multiplier;
639:         nz         = bi[row + 1] - diag_offset[row] - 1;
640:         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
641:         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
642:       }
643:       row = *ajtmp++;
644:     }
645:     /* finished row so stick it into b->a */
646:     pv = ba + bi[i];
647:     pj = bj + bi[i];
648:     nz = bi[i + 1] - bi[i];
649:     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
650:     diag = diag_offset[i] - bi[i];
651:     /* check pivot entry for current row */
652:     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
653:     pv[diag] = 1.0 / pv[diag];
654:   }

656:   PetscCall(PetscFree(rtmp));
657:   PetscCall(ISRestoreIndices(isicol, &ic));
658:   PetscCall(ISRestoreIndices(isrow, &r));
659:   PetscCall(ISIdentity(isrow, &row_identity));
660:   PetscCall(ISIdentity(isicol, &col_identity));
661:   if (row_identity && col_identity) {
662:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
663:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
664:   } else {
665:     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
666:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
667:   }
668:   C->assembled = PETSC_TRUE;
669:   PetscCall(PetscLogFlops(C->cmap->n));
670:   PetscFunctionReturn(PETSC_SUCCESS);
671: }

673: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
674: {
675:   PetscFunctionBegin;
676:   *type = MATSOLVERPETSC;
677:   PetscFunctionReturn(PETSC_SUCCESS);
678: }

680: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
681: {
682:   PetscInt n = A->rmap->n;

684:   PetscFunctionBegin;
685: #if defined(PETSC_USE_COMPLEX)
686:   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
687: #endif
688:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
689:   PetscCall(MatSetSizes(*B, n, n, n, n));
690:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
691:     PetscCall(MatSetType(*B, MATSEQBAIJ));

693:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
694:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
695:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
696:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
697:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
698:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
699:     PetscCall(MatSetType(*B, MATSEQSBAIJ));
700:     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));

702:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
703:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
704:     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
705:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
706:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
707:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
708:   (*B)->factortype     = ftype;
709:   (*B)->canuseordering = PETSC_TRUE;

711:   PetscCall(PetscFree((*B)->solvertype));
712:   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
713:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
714:   PetscFunctionReturn(PETSC_SUCCESS);
715: }

717: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
718: {
719:   Mat C;

721:   PetscFunctionBegin;
722:   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
723:   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
724:   PetscCall(MatLUFactorNumeric(C, A, info));

726:   A->ops->solve          = C->ops->solve;
727:   A->ops->solvetranspose = C->ops->solvetranspose;

729:   PetscCall(MatHeaderMerge(A, &C));
730:   PetscFunctionReturn(PETSC_SUCCESS);
731: }

733: #include <../src/mat/impls/sbaij/seq/sbaij.h>
734: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
735: {
736:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
737:   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
738:   IS              ip = b->row;
739:   const PetscInt *rip;
740:   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
741:   PetscInt       *ai = a->i, *aj = a->j;
742:   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
743:   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
744:   PetscReal       rs;
745:   FactorShiftCtx  sctx;

747:   PetscFunctionBegin;
748:   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
749:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
750:     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
751:     PetscCall(MatDestroy(&a->sbaijMat));
752:     PetscFunctionReturn(PETSC_SUCCESS);
753:   }

755:   /* MatPivotSetUp(): initialize shift context sctx */
756:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

758:   PetscCall(ISGetIndices(ip, &rip));
759:   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

761:   sctx.shift_amount = 0.;
762:   sctx.nshift       = 0;
763:   do {
764:     sctx.newshift = PETSC_FALSE;
765:     for (i = 0; i < mbs; i++) {
766:       rtmp[i] = 0.0;
767:       jl[i]   = mbs;
768:       il[0]   = 0;
769:     }

771:     for (k = 0; k < mbs; k++) {
772:       bval = ba + bi[k];
773:       /* initialize k-th row by the perm[k]-th row of A */
774:       jmin = ai[rip[k]];
775:       jmax = ai[rip[k] + 1];
776:       for (j = jmin; j < jmax; j++) {
777:         col = rip[aj[j]];
778:         if (col >= k) { /* only take upper triangular entry */
779:           rtmp[col] = aa[j];
780:           *bval++   = 0.0; /* for in-place factorization */
781:         }
782:       }

784:       /* shift the diagonal of the matrix */
785:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

787:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
788:       dk = rtmp[k];
789:       i  = jl[k]; /* first row to be added to k_th row  */

791:       while (i < k) {
792:         nexti = jl[i]; /* next row to be added to k_th row */

794:         /* compute multiplier, update diag(k) and U(i,k) */
795:         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
796:         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
797:         dk += uikdi * ba[ili];
798:         ba[ili] = uikdi; /* -U(i,k) */

800:         /* add multiple of row i to k-th row */
801:         jmin = ili + 1;
802:         jmax = bi[i + 1];
803:         if (jmin < jmax) {
804:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
805:           /* update il and jl for row i */
806:           il[i] = jmin;
807:           j     = bj[jmin];
808:           jl[i] = jl[j];
809:           jl[j] = i;
810:         }
811:         i = nexti;
812:       }

814:       /* shift the diagonals when zero pivot is detected */
815:       /* compute rs=sum of abs(off-diagonal) */
816:       rs   = 0.0;
817:       jmin = bi[k] + 1;
818:       nz   = bi[k + 1] - jmin;
819:       if (nz) {
820:         bcol = bj + jmin;
821:         while (nz--) {
822:           rs += PetscAbsScalar(rtmp[*bcol]);
823:           bcol++;
824:         }
825:       }

827:       sctx.rs = rs;
828:       sctx.pv = dk;
829:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
830:       if (sctx.newshift) break;
831:       dk = sctx.pv;

833:       /* copy data into U(k,:) */
834:       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
835:       jmin      = bi[k] + 1;
836:       jmax      = bi[k + 1];
837:       if (jmin < jmax) {
838:         for (j = jmin; j < jmax; j++) {
839:           col       = bj[j];
840:           ba[j]     = rtmp[col];
841:           rtmp[col] = 0.0;
842:         }
843:         /* add the k-th row into il and jl */
844:         il[k] = jmin;
845:         i     = bj[jmin];
846:         jl[k] = jl[i];
847:         jl[i] = k;
848:       }
849:     }
850:   } while (sctx.newshift);
851:   PetscCall(PetscFree3(rtmp, il, jl));

853:   PetscCall(ISRestoreIndices(ip, &rip));

855:   C->assembled    = PETSC_TRUE;
856:   C->preallocated = PETSC_TRUE;

858:   PetscCall(PetscLogFlops(C->rmap->N));
859:   if (sctx.nshift) {
860:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
861:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
862:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
863:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
864:     }
865:   }
866:   PetscFunctionReturn(PETSC_SUCCESS);
867: }

869: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
870: {
871:   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
872:   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
873:   PetscInt       i, j, am = a->mbs;
874:   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
875:   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
876:   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
877:   PetscReal      rs;
878:   FactorShiftCtx sctx;

880:   PetscFunctionBegin;
881:   /* MatPivotSetUp(): initialize shift context sctx */
882:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

884:   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));

886:   do {
887:     sctx.newshift = PETSC_FALSE;
888:     for (i = 0; i < am; i++) {
889:       rtmp[i] = 0.0;
890:       jl[i]   = am;
891:       il[0]   = 0;
892:     }

894:     for (k = 0; k < am; k++) {
895:       /* initialize k-th row with elements nonzero in row perm(k) of A */
896:       nz   = ai[k + 1] - ai[k];
897:       acol = aj + ai[k];
898:       aval = aa + ai[k];
899:       bval = ba + bi[k];
900:       while (nz--) {
901:         if (*acol < k) { /* skip lower triangular entries */
902:           acol++;
903:           aval++;
904:         } else {
905:           rtmp[*acol++] = *aval++;
906:           *bval++       = 0.0; /* for in-place factorization */
907:         }
908:       }

910:       /* shift the diagonal of the matrix */
911:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

913:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
914:       dk = rtmp[k];
915:       i  = jl[k]; /* first row to be added to k_th row  */

917:       while (i < k) {
918:         nexti = jl[i]; /* next row to be added to k_th row */
919:         /* compute multiplier, update D(k) and U(i,k) */
920:         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
921:         uikdi = -ba[ili] * ba[bi[i]];
922:         dk += uikdi * ba[ili];
923:         ba[ili] = uikdi; /* -U(i,k) */

925:         /* add multiple of row i to k-th row ... */
926:         jmin = ili + 1;
927:         nz   = bi[i + 1] - jmin;
928:         if (nz > 0) {
929:           bcol = bj + jmin;
930:           bval = ba + jmin;
931:           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
932:           /* update il and jl for i-th row */
933:           il[i] = jmin;
934:           j     = bj[jmin];
935:           jl[i] = jl[j];
936:           jl[j] = i;
937:         }
938:         i = nexti;
939:       }

941:       /* shift the diagonals when zero pivot is detected */
942:       /* compute rs=sum of abs(off-diagonal) */
943:       rs   = 0.0;
944:       jmin = bi[k] + 1;
945:       nz   = bi[k + 1] - jmin;
946:       if (nz) {
947:         bcol = bj + jmin;
948:         while (nz--) {
949:           rs += PetscAbsScalar(rtmp[*bcol]);
950:           bcol++;
951:         }
952:       }

954:       sctx.rs = rs;
955:       sctx.pv = dk;
956:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
957:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
958:       dk = sctx.pv;

960:       /* copy data into U(k,:) */
961:       ba[bi[k]] = 1.0 / dk;
962:       jmin      = bi[k] + 1;
963:       nz        = bi[k + 1] - jmin;
964:       if (nz) {
965:         bcol = bj + jmin;
966:         bval = ba + jmin;
967:         while (nz--) {
968:           *bval++       = rtmp[*bcol];
969:           rtmp[*bcol++] = 0.0;
970:         }
971:         /* add k-th row into il and jl */
972:         il[k] = jmin;
973:         i     = bj[jmin];
974:         jl[k] = jl[i];
975:         jl[i] = k;
976:       }
977:     }
978:   } while (sctx.newshift);
979:   PetscCall(PetscFree3(rtmp, il, jl));

981:   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
982:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983:   C->assembled           = PETSC_TRUE;
984:   C->preallocated        = PETSC_TRUE;

986:   PetscCall(PetscLogFlops(C->rmap->N));
987:   if (sctx.nshift) {
988:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
989:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
990:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
991:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
992:     }
993:   }
994:   PetscFunctionReturn(PETSC_SUCCESS);
995: }

997: #include <petscbt.h>
998: #include <../src/mat/utils/freespace.h>
999: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1000: {
1001:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1002:   Mat_SeqSBAIJ      *b;
1003:   Mat                B;
1004:   PetscBool          perm_identity, missing;
1005:   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1006:   const PetscInt    *rip;
1007:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1008:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1009:   PetscReal          fill = info->fill, levels = info->levels;
1010:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1011:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1012:   PetscBT            lnkbt;

1014:   PetscFunctionBegin;
1015:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1016:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);

1018:   if (bs > 1) {
1019:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1020:     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1022:     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1023:     PetscFunctionReturn(PETSC_SUCCESS);
1024:   }

1026:   PetscCall(ISIdentity(perm, &perm_identity));
1027:   PetscCall(ISGetIndices(perm, &rip));

1029:   /* special case that simply copies fill pattern */
1030:   if (!levels && perm_identity) {
1031:     PetscCall(PetscMalloc1(am + 1, &ui));
1032:     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1033:     B = fact;
1034:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));

1036:     b  = (Mat_SeqSBAIJ *)B->data;
1037:     uj = b->j;
1038:     for (i = 0; i < am; i++) {
1039:       aj = a->j + a->diag[i];
1040:       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1041:       b->ilen[i] = ui[i];
1042:     }
1043:     PetscCall(PetscFree(ui));

1045:     B->factortype = MAT_FACTOR_NONE;

1047:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1048:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1049:     B->factortype = MAT_FACTOR_ICC;

1051:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1052:     PetscFunctionReturn(PETSC_SUCCESS);
1053:   }

1055:   /* initialization */
1056:   PetscCall(PetscMalloc1(am + 1, &ui));
1057:   ui[0] = 0;
1058:   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));

1060:   /* jl: linked list for storing indices of the pivot rows
1061:      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1062:   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1063:   for (i = 0; i < am; i++) {
1064:     jl[i] = am;
1065:     il[i] = 0;
1066:   }

1068:   /* create and initialize a linked list for storing column indices of the active row k */
1069:   nlnk = am + 1;
1070:   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

1072:   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1073:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));

1075:   current_space = free_space;

1077:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1078:   current_space_lvl = free_space_lvl;

1080:   for (k = 0; k < am; k++) { /* for each active row k */
1081:     /* initialize lnk by the column indices of row rip[k] of A */
1082:     nzk         = 0;
1083:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1084:     ncols_upper = 0;
1085:     cols        = cols_lvl + am;
1086:     for (j = 0; j < ncols; j++) {
1087:       i = rip[*(aj + ai[rip[k]] + j)];
1088:       if (i >= k) { /* only take upper triangular entry */
1089:         cols[ncols_upper]     = i;
1090:         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1091:         ncols_upper++;
1092:       }
1093:     }
1094:     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1095:     nzk += nlnk;

1097:     /* update lnk by computing fill-in for each pivot row to be merged in */
1098:     prow = jl[k]; /* 1st pivot row */

1100:     while (prow < k) {
1101:       nextprow = jl[prow];

1103:       /* merge prow into k-th row */
1104:       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1105:       jmax  = ui[prow + 1];
1106:       ncols = jmax - jmin;
1107:       i     = jmin - ui[prow];
1108:       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1109:       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1110:       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1111:       nzk += nlnk;

1113:       /* update il and jl for prow */
1114:       if (jmin < jmax) {
1115:         il[prow] = jmin;

1117:         j        = *cols;
1118:         jl[prow] = jl[j];
1119:         jl[j]    = prow;
1120:       }
1121:       prow = nextprow;
1122:     }

1124:     /* if free space is not available, make more free space */
1125:     if (current_space->local_remaining < nzk) {
1126:       i = am - k + 1;                                                             /* num of unfactored rows */
1127:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1128:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1129:       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1130:       reallocs++;
1131:     }

1133:     /* copy data into free_space and free_space_lvl, then initialize lnk */
1134:     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

1136:     /* add the k-th row into il and jl */
1137:     if (nzk - 1 > 0) {
1138:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1139:       jl[k] = jl[i];
1140:       jl[i] = k;
1141:       il[k] = ui[k] + 1;
1142:     }
1143:     uj_ptr[k]     = current_space->array;
1144:     uj_lvl_ptr[k] = current_space_lvl->array;

1146:     current_space->array += nzk;
1147:     current_space->local_used += nzk;
1148:     current_space->local_remaining -= nzk;

1150:     current_space_lvl->array += nzk;
1151:     current_space_lvl->local_used += nzk;
1152:     current_space_lvl->local_remaining -= nzk;

1154:     ui[k + 1] = ui[k] + nzk;
1155:   }

1157:   PetscCall(ISRestoreIndices(perm, &rip));
1158:   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1159:   PetscCall(PetscFree(cols_lvl));

1161:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1162:   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1163:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1164:   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1165:   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

1167:   /* put together the new matrix in MATSEQSBAIJ format */
1168:   B = fact;
1169:   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));

1171:   b               = (Mat_SeqSBAIJ *)B->data;
1172:   b->singlemalloc = PETSC_FALSE;
1173:   b->free_a       = PETSC_TRUE;
1174:   b->free_ij      = PETSC_TRUE;

1176:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

1178:   b->j             = uj;
1179:   b->i             = ui;
1180:   b->diag          = NULL;
1181:   b->ilen          = NULL;
1182:   b->imax          = NULL;
1183:   b->row           = perm;
1184:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1186:   PetscCall(PetscObjectReference((PetscObject)perm));

1188:   b->icol = perm;

1190:   PetscCall(PetscObjectReference((PetscObject)perm));
1191:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

1193:   b->maxnz = b->nz = ui[am];

1195:   B->info.factor_mallocs   = reallocs;
1196:   B->info.fill_ratio_given = fill;
1197:   if (ai[am] != 0.) {
1198:     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1199:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1200:   } else {
1201:     B->info.fill_ratio_needed = 0.0;
1202:   }
1203: #if defined(PETSC_USE_INFO)
1204:   if (ai[am] != 0) {
1205:     PetscReal af = B->info.fill_ratio_needed;
1206:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1207:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1208:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1209:   } else {
1210:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1211:   }
1212: #endif
1213:   if (perm_identity) {
1214:     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1215:     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1217:   } else {
1218:     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1219:   }
1220:   PetscFunctionReturn(PETSC_SUCCESS);
1221: }

1223: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1224: {
1225:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1226:   Mat_SeqSBAIJ      *b;
1227:   Mat                B;
1228:   PetscBool          perm_identity, missing;
1229:   PetscReal          fill = info->fill;
1230:   const PetscInt    *rip;
1231:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1232:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1233:   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1234:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1235:   PetscBT            lnkbt;

1237:   PetscFunctionBegin;
1238:   if (bs > 1) { /* convert to seqsbaij */
1239:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1240:     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1242:     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1243:     PetscFunctionReturn(PETSC_SUCCESS);
1244:   }

1246:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1247:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);

1249:   /* check whether perm is the identity mapping */
1250:   PetscCall(ISIdentity(perm, &perm_identity));
1251:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1252:   PetscCall(ISGetIndices(perm, &rip));

1254:   /* initialization */
1255:   PetscCall(PetscMalloc1(mbs + 1, &ui));
1256:   ui[0] = 0;

1258:   /* jl: linked list for storing indices of the pivot rows
1259:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1260:   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1261:   for (i = 0; i < mbs; i++) {
1262:     jl[i] = mbs;
1263:     il[i] = 0;
1264:   }

1266:   /* create and initialize a linked list for storing column indices of the active row k */
1267:   nlnk = mbs + 1;
1268:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

1270:   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1271:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));

1273:   current_space = free_space;

1275:   for (k = 0; k < mbs; k++) { /* for each active row k */
1276:     /* initialize lnk by the column indices of row rip[k] of A */
1277:     nzk         = 0;
1278:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1279:     ncols_upper = 0;
1280:     for (j = 0; j < ncols; j++) {
1281:       i = rip[*(aj + ai[rip[k]] + j)];
1282:       if (i >= k) { /* only take upper triangular entry */
1283:         cols[ncols_upper] = i;
1284:         ncols_upper++;
1285:       }
1286:     }
1287:     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1288:     nzk += nlnk;

1290:     /* update lnk by computing fill-in for each pivot row to be merged in */
1291:     prow = jl[k]; /* 1st pivot row */

1293:     while (prow < k) {
1294:       nextprow = jl[prow];
1295:       /* merge prow into k-th row */
1296:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1297:       jmax   = ui[prow + 1];
1298:       ncols  = jmax - jmin;
1299:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1300:       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1301:       nzk += nlnk;

1303:       /* update il and jl for prow */
1304:       if (jmin < jmax) {
1305:         il[prow] = jmin;
1306:         j        = *uj_ptr;
1307:         jl[prow] = jl[j];
1308:         jl[j]    = prow;
1309:       }
1310:       prow = nextprow;
1311:     }

1313:     /* if free space is not available, make more free space */
1314:     if (current_space->local_remaining < nzk) {
1315:       i = mbs - k + 1;                                                            /* num of unfactored rows */
1316:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1317:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1318:       reallocs++;
1319:     }

1321:     /* copy data into free space, then initialize lnk */
1322:     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

1324:     /* add the k-th row into il and jl */
1325:     if (nzk - 1 > 0) {
1326:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1327:       jl[k] = jl[i];
1328:       jl[i] = k;
1329:       il[k] = ui[k] + 1;
1330:     }
1331:     ui_ptr[k] = current_space->array;
1332:     current_space->array += nzk;
1333:     current_space->local_used += nzk;
1334:     current_space->local_remaining -= nzk;

1336:     ui[k + 1] = ui[k] + nzk;
1337:   }

1339:   PetscCall(ISRestoreIndices(perm, &rip));
1340:   PetscCall(PetscFree4(ui_ptr, il, jl, cols));

1342:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1343:   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1344:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1345:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1347:   /* put together the new matrix in MATSEQSBAIJ format */
1348:   B = fact;
1349:   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

1351:   b               = (Mat_SeqSBAIJ *)B->data;
1352:   b->singlemalloc = PETSC_FALSE;
1353:   b->free_a       = PETSC_TRUE;
1354:   b->free_ij      = PETSC_TRUE;

1356:   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

1358:   b->j             = uj;
1359:   b->i             = ui;
1360:   b->diag          = NULL;
1361:   b->ilen          = NULL;
1362:   b->imax          = NULL;
1363:   b->row           = perm;
1364:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1366:   PetscCall(PetscObjectReference((PetscObject)perm));
1367:   b->icol = perm;
1368:   PetscCall(PetscObjectReference((PetscObject)perm));
1369:   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1370:   b->maxnz = b->nz = ui[mbs];

1372:   B->info.factor_mallocs   = reallocs;
1373:   B->info.fill_ratio_given = fill;
1374:   if (ai[mbs] != 0.) {
1375:     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1376:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1377:   } else {
1378:     B->info.fill_ratio_needed = 0.0;
1379:   }
1380: #if defined(PETSC_USE_INFO)
1381:   if (ai[mbs] != 0.) {
1382:     PetscReal af = B->info.fill_ratio_needed;
1383:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1384:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1385:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1386:   } else {
1387:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1388:   }
1389: #endif
1390:   if (perm_identity) {
1391:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1392:   } else {
1393:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1394:   }
1395:   PetscFunctionReturn(PETSC_SUCCESS);
1396: }

1398: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1399: {
1400:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1401:   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1402:   PetscInt           i, k, n                       = a->mbs;
1403:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1404:   const MatScalar   *aa = a->a, *v;
1405:   PetscScalar       *x, *s, *t, *ls;
1406:   const PetscScalar *b;

1408:   PetscFunctionBegin;
1409:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1410:   PetscCall(VecGetArrayRead(bb, &b));
1411:   PetscCall(VecGetArray(xx, &x));
1412:   t = a->solve_work;

1414:   /* forward solve the lower triangular */
1415:   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */

1417:   for (i = 1; i < n; i++) {
1418:     v  = aa + bs2 * ai[i];
1419:     vi = aj + ai[i];
1420:     nz = ai[i + 1] - ai[i];
1421:     s  = t + bs * i;
1422:     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1423:     for (k = 0; k < nz; k++) {
1424:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1425:       v += bs2;
1426:     }
1427:   }

1429:   /* backward solve the upper triangular */
1430:   ls = a->solve_work + A->cmap->n;
1431:   for (i = n - 1; i >= 0; i--) {
1432:     v  = aa + bs2 * (adiag[i + 1] + 1);
1433:     vi = aj + adiag[i + 1] + 1;
1434:     nz = adiag[i] - adiag[i + 1] - 1;
1435:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1436:     for (k = 0; k < nz; k++) {
1437:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1438:       v += bs2;
1439:     }
1440:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1441:     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1442:   }

1444:   PetscCall(VecRestoreArrayRead(bb, &b));
1445:   PetscCall(VecRestoreArray(xx, &x));
1446:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1447:   PetscFunctionReturn(PETSC_SUCCESS);
1448: }

1450: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1451: {
1452:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1453:   IS                 iscol = a->col, isrow = a->row;
1454:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1455:   PetscInt           i, m, n = a->mbs;
1456:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1457:   const MatScalar   *aa = a->a, *v;
1458:   PetscScalar       *x, *s, *t, *ls;
1459:   const PetscScalar *b;

1461:   PetscFunctionBegin;
1462:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1463:   PetscCall(VecGetArrayRead(bb, &b));
1464:   PetscCall(VecGetArray(xx, &x));
1465:   t = a->solve_work;

1467:   PetscCall(ISGetIndices(isrow, &rout));
1468:   r = rout;
1469:   PetscCall(ISGetIndices(iscol, &cout));
1470:   c = cout;

1472:   /* forward solve the lower triangular */
1473:   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1474:   for (i = 1; i < n; i++) {
1475:     v  = aa + bs2 * ai[i];
1476:     vi = aj + ai[i];
1477:     nz = ai[i + 1] - ai[i];
1478:     s  = t + bs * i;
1479:     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1480:     for (m = 0; m < nz; m++) {
1481:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1482:       v += bs2;
1483:     }
1484:   }

1486:   /* backward solve the upper triangular */
1487:   ls = a->solve_work + A->cmap->n;
1488:   for (i = n - 1; i >= 0; i--) {
1489:     v  = aa + bs2 * (adiag[i + 1] + 1);
1490:     vi = aj + adiag[i + 1] + 1;
1491:     nz = adiag[i] - adiag[i + 1] - 1;
1492:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1493:     for (m = 0; m < nz; m++) {
1494:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1495:       v += bs2;
1496:     }
1497:     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1498:     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1499:   }
1500:   PetscCall(ISRestoreIndices(isrow, &rout));
1501:   PetscCall(ISRestoreIndices(iscol, &cout));
1502:   PetscCall(VecRestoreArrayRead(bb, &b));
1503:   PetscCall(VecRestoreArray(xx, &x));
1504:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1505:   PetscFunctionReturn(PETSC_SUCCESS);
1506: }

1508: /*
1509:     For each block in an block array saves the largest absolute value in the block into another array
1510: */
1511: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1512: {
1513:   PetscInt i, j;

1515:   PetscFunctionBegin;
1516:   PetscCall(PetscArrayzero(absarray, nbs + 1));
1517:   for (i = 0; i < nbs; i++) {
1518:     for (j = 0; j < bs2; j++) {
1519:       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1520:     }
1521:   }
1522:   PetscFunctionReturn(PETSC_SUCCESS);
1523: }

1525: /*
1526:      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1527: */
1528: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1529: {
1530:   Mat             B = *fact;
1531:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1532:   IS              isicol;
1533:   const PetscInt *r, *ic;
1534:   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1535:   PetscInt       *bi, *bj, *bdiag;

1537:   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1538:   PetscInt   nlnk, *lnk;
1539:   PetscBT    lnkbt;
1540:   PetscBool  row_identity, icol_identity;
1541:   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1542:   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;

1544:   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1545:   PetscInt   nnz_max;
1546:   PetscBool  missing;
1547:   PetscReal *vtmp_abs;
1548:   MatScalar *v_work;
1549:   PetscInt  *v_pivots;
1550:   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;

1552:   PetscFunctionBegin;
1553:   /* symbolic factorization, can be reused */
1554:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1555:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1556:   adiag = a->diag;

1558:   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));

1560:   /* bdiag is location of diagonal in factor */
1561:   PetscCall(PetscMalloc1(mbs + 1, &bdiag));

1563:   /* allocate row pointers bi */
1564:   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));

1566:   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1567:   dtcount = (PetscInt)info->dtcount;
1568:   if (dtcount > mbs - 1) dtcount = mbs - 1;
1569:   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1570:   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1571:   PetscCall(PetscMalloc1(nnz_max, &bj));
1572:   nnz_max = nnz_max * bs2;
1573:   PetscCall(PetscMalloc1(nnz_max, &ba));

1575:   /* put together the new matrix */
1576:   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

1578:   b               = (Mat_SeqBAIJ *)(B)->data;
1579:   b->free_a       = PETSC_TRUE;
1580:   b->free_ij      = PETSC_TRUE;
1581:   b->singlemalloc = PETSC_FALSE;

1583:   b->a    = ba;
1584:   b->j    = bj;
1585:   b->i    = bi;
1586:   b->diag = bdiag;
1587:   b->ilen = NULL;
1588:   b->imax = NULL;
1589:   b->row  = isrow;
1590:   b->col  = iscol;

1592:   PetscCall(PetscObjectReference((PetscObject)isrow));
1593:   PetscCall(PetscObjectReference((PetscObject)iscol));

1595:   b->icol = isicol;
1596:   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1597:   b->maxnz = nnz_max / bs2;

1599:   (B)->factortype            = MAT_FACTOR_ILUDT;
1600:   (B)->info.factor_mallocs   = 0;
1601:   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1602:   /* end of symbolic factorization */
1603:   PetscCall(ISGetIndices(isrow, &r));
1604:   PetscCall(ISGetIndices(isicol, &ic));

1606:   /* linked list for storing column indices of the active row */
1607:   nlnk = mbs + 1;
1608:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

1610:   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1611:   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1612:   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1613:   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1614:   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1615:   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));

1617:   allowzeropivot  = PetscNot(A->erroriffailure);
1618:   bi[0]           = 0;
1619:   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1620:   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1621:   for (i = 0; i < mbs; i++) {
1622:     /* copy initial fill into linked list */
1623:     nzi = ai[r[i] + 1] - ai[r[i]];
1624:     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1625:     nzi_al = adiag[r[i]] - ai[r[i]];
1626:     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;

1628:     /* load in initial unfactored row */
1629:     ajtmp = aj + ai[r[i]];
1630:     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1631:     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1632:     aatmp = a->a + bs2 * ai[r[i]];
1633:     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));

1635:     /* add pivot rows into linked list */
1636:     row = lnk[mbs];
1637:     while (row < i) {
1638:       nzi_bl = bi[row + 1] - bi[row] + 1;
1639:       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1640:       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1641:       nzi += nlnk;
1642:       row = lnk[row];
1643:     }

1645:     /* copy data from lnk into jtmp, then initialize lnk */
1646:     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));

1648:     /* numerical factorization */
1649:     bjtmp = jtmp;
1650:     row   = *bjtmp++; /* 1st pivot row */

1652:     while (row < i) {
1653:       pc = rtmp + bs2 * row;
1654:       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1655:       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1656:       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1657:       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1658:         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1659:         pv = ba + bs2 * (bdiag[row + 1] + 1);
1660:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1661:         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1662:         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1663:       }
1664:       row = *bjtmp++;
1665:     }

1667:     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1668:     nzi_bl = 0;
1669:     j      = 0;
1670:     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1671:       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1672:       nzi_bl++;
1673:       j++;
1674:     }
1675:     nzi_bu = nzi - nzi_bl - 1;

1677:     while (j < nzi) { /* U-part */
1678:       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1679:       j++;
1680:     }

1682:     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));

1684:     bjtmp = bj + bi[i];
1685:     batmp = ba + bs2 * bi[i];
1686:     /* apply level dropping rule to L part */
1687:     ncut = nzi_al + dtcount;
1688:     if (ncut < nzi_bl) {
1689:       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1690:       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1691:     } else {
1692:       ncut = nzi_bl;
1693:     }
1694:     for (j = 0; j < ncut; j++) {
1695:       bjtmp[j] = jtmp[j];
1696:       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1697:     }
1698:     bi[i + 1] = bi[i] + ncut;
1699:     nzi       = ncut + 1;

1701:     /* apply level dropping rule to U part */
1702:     ncut = nzi_au + dtcount;
1703:     if (ncut < nzi_bu) {
1704:       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1705:       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1706:     } else {
1707:       ncut = nzi_bu;
1708:     }
1709:     nzi += ncut;

1711:     /* mark bdiagonal */
1712:     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1713:     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);

1715:     bjtmp = bj + bdiag[i];
1716:     batmp = ba + bs2 * bdiag[i];
1717:     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1718:     *bjtmp = i;

1720:     bjtmp = bj + bdiag[i + 1] + 1;
1721:     batmp = ba + (bdiag[i + 1] + 1) * bs2;

1723:     for (k = 0; k < ncut; k++) {
1724:       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1725:       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1726:     }

1728:     im[i] = nzi; /* used by PetscLLAddSortedLU() */

1730:     /* invert diagonal block for simpler triangular solves - add shift??? */
1731:     batmp = ba + bs2 * bdiag[i];

1733:     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1734:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1735:   } /* for (i=0; i<mbs; i++) */
1736:   PetscCall(PetscFree3(v_work, multiplier, v_pivots));

1738:   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1739:   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);

1741:   PetscCall(ISRestoreIndices(isrow, &r));
1742:   PetscCall(ISRestoreIndices(isicol, &ic));

1744:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1746:   PetscCall(PetscFree2(im, jtmp));
1747:   PetscCall(PetscFree2(rtmp, vtmp));

1749:   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1750:   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];

1752:   PetscCall(ISIdentity(isrow, &row_identity));
1753:   PetscCall(ISIdentity(isicol, &icol_identity));
1754:   if (row_identity && icol_identity) {
1755:     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1756:   } else {
1757:     B->ops->solve = MatSolve_SeqBAIJ_N;
1758:   }

1760:   B->ops->solveadd          = NULL;
1761:   B->ops->solvetranspose    = NULL;
1762:   B->ops->solvetransposeadd = NULL;
1763:   B->ops->matsolve          = NULL;
1764:   B->assembled              = PETSC_TRUE;
1765:   B->preallocated           = PETSC_TRUE;
1766:   PetscFunctionReturn(PETSC_SUCCESS);
1767: }