Actual source code: baijsolv.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  7:   IS                 iscol = a->col, isrow = a->row;
  8:   const PetscInt    *r, *c, *rout, *cout;
  9:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
 10:   PetscInt           i, nz;
 11:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 12:   const MatScalar   *aa = a->a, *v;
 13:   PetscScalar       *x, *s, *t, *ls;
 14:   const PetscScalar *b;

 16:   PetscFunctionBegin;
 17:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
 18:   PetscCall(VecGetArrayRead(bb, &b));
 19:   PetscCall(VecGetArray(xx, &x));
 20:   t = a->solve_work;

 22:   PetscCall(ISGetIndices(isrow, &rout));
 23:   r = rout;
 24:   PetscCall(ISGetIndices(iscol, &cout));
 25:   c = cout + (n - 1);

 27:   /* forward solve the lower triangular */
 28:   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
 29:   for (i = 1; i < n; i++) {
 30:     v  = aa + bs2 * ai[i];
 31:     vi = aj + ai[i];
 32:     nz = a->diag[i] - ai[i];
 33:     s  = t + bs * i;
 34:     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
 35:     while (nz--) {
 36:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
 37:       v += bs2;
 38:     }
 39:   }
 40:   /* backward solve the upper triangular */
 41:   ls = a->solve_work + A->cmap->n;
 42:   for (i = n - 1; i >= 0; i--) {
 43:     v  = aa + bs2 * (a->diag[i] + 1);
 44:     vi = aj + a->diag[i] + 1;
 45:     nz = ai[i + 1] - a->diag[i] - 1;
 46:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
 47:     while (nz--) {
 48:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
 49:       v += bs2;
 50:     }
 51:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 52:     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
 53:   }

 55:   PetscCall(ISRestoreIndices(isrow, &rout));
 56:   PetscCall(ISRestoreIndices(iscol, &cout));
 57:   PetscCall(VecRestoreArrayRead(bb, &b));
 58:   PetscCall(VecRestoreArray(xx, &x));
 59:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
 60:   PetscFunctionReturn(PETSC_SUCCESS);
 61: }

 63: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
 64: {
 65:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 66:   IS                 iscol = a->col, isrow = a->row;
 67:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
 68:   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
 69:   PetscInt           i, nz, idx, idt, idc;
 70:   const MatScalar   *aa = a->a, *v;
 71:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
 72:   const PetscScalar *b;

 74:   PetscFunctionBegin;
 75:   PetscCall(VecGetArrayRead(bb, &b));
 76:   PetscCall(VecGetArray(xx, &x));
 77:   t = a->solve_work;

 79:   PetscCall(ISGetIndices(isrow, &rout));
 80:   r = rout;
 81:   PetscCall(ISGetIndices(iscol, &cout));
 82:   c = cout + (n - 1);

 84:   /* forward solve the lower triangular */
 85:   idx  = 7 * (*r++);
 86:   t[0] = b[idx];
 87:   t[1] = b[1 + idx];
 88:   t[2] = b[2 + idx];
 89:   t[3] = b[3 + idx];
 90:   t[4] = b[4 + idx];
 91:   t[5] = b[5 + idx];
 92:   t[6] = b[6 + idx];

 94:   for (i = 1; i < n; i++) {
 95:     v   = aa + 49 * ai[i];
 96:     vi  = aj + ai[i];
 97:     nz  = diag[i] - ai[i];
 98:     idx = 7 * (*r++);
 99:     s1  = b[idx];
100:     s2  = b[1 + idx];
101:     s3  = b[2 + idx];
102:     s4  = b[3 + idx];
103:     s5  = b[4 + idx];
104:     s6  = b[5 + idx];
105:     s7  = b[6 + idx];
106:     while (nz--) {
107:       idx = 7 * (*vi++);
108:       x1  = t[idx];
109:       x2  = t[1 + idx];
110:       x3  = t[2 + idx];
111:       x4  = t[3 + idx];
112:       x5  = t[4 + idx];
113:       x6  = t[5 + idx];
114:       x7  = t[6 + idx];
115:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122:       v += 49;
123:     }
124:     idx        = 7 * i;
125:     t[idx]     = s1;
126:     t[1 + idx] = s2;
127:     t[2 + idx] = s3;
128:     t[3 + idx] = s4;
129:     t[4 + idx] = s5;
130:     t[5 + idx] = s6;
131:     t[6 + idx] = s7;
132:   }
133:   /* backward solve the upper triangular */
134:   for (i = n - 1; i >= 0; i--) {
135:     v   = aa + 49 * diag[i] + 49;
136:     vi  = aj + diag[i] + 1;
137:     nz  = ai[i + 1] - diag[i] - 1;
138:     idt = 7 * i;
139:     s1  = t[idt];
140:     s2  = t[1 + idt];
141:     s3  = t[2 + idt];
142:     s4  = t[3 + idt];
143:     s5  = t[4 + idt];
144:     s6  = t[5 + idt];
145:     s7  = t[6 + idt];
146:     while (nz--) {
147:       idx = 7 * (*vi++);
148:       x1  = t[idx];
149:       x2  = t[1 + idx];
150:       x3  = t[2 + idx];
151:       x4  = t[3 + idx];
152:       x5  = t[4 + idx];
153:       x6  = t[5 + idx];
154:       x7  = t[6 + idx];
155:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162:       v += 49;
163:     }
164:     idc    = 7 * (*c--);
165:     v      = aa + 49 * diag[i];
166:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173:   }

175:   PetscCall(ISRestoreIndices(isrow, &rout));
176:   PetscCall(ISRestoreIndices(iscol, &cout));
177:   PetscCall(VecRestoreArrayRead(bb, &b));
178:   PetscCall(VecRestoreArray(xx, &x));
179:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180:   PetscFunctionReturn(PETSC_SUCCESS);
181: }

183: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184: {
185:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
186:   IS                 iscol = a->col, isrow = a->row;
187:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188:   const PetscInt     n = a->mbs, *rout, *cout, *vi;
189:   PetscInt           i, nz, idx, idt, idc, m;
190:   const MatScalar   *aa = a->a, *v;
191:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192:   const PetscScalar *b;

194:   PetscFunctionBegin;
195:   PetscCall(VecGetArrayRead(bb, &b));
196:   PetscCall(VecGetArray(xx, &x));
197:   t = a->solve_work;

199:   PetscCall(ISGetIndices(isrow, &rout));
200:   r = rout;
201:   PetscCall(ISGetIndices(iscol, &cout));
202:   c = cout;

204:   /* forward solve the lower triangular */
205:   idx  = 7 * r[0];
206:   t[0] = b[idx];
207:   t[1] = b[1 + idx];
208:   t[2] = b[2 + idx];
209:   t[3] = b[3 + idx];
210:   t[4] = b[4 + idx];
211:   t[5] = b[5 + idx];
212:   t[6] = b[6 + idx];

214:   for (i = 1; i < n; i++) {
215:     v   = aa + 49 * ai[i];
216:     vi  = aj + ai[i];
217:     nz  = ai[i + 1] - ai[i];
218:     idx = 7 * r[i];
219:     s1  = b[idx];
220:     s2  = b[1 + idx];
221:     s3  = b[2 + idx];
222:     s4  = b[3 + idx];
223:     s5  = b[4 + idx];
224:     s6  = b[5 + idx];
225:     s7  = b[6 + idx];
226:     for (m = 0; m < nz; m++) {
227:       idx = 7 * vi[m];
228:       x1  = t[idx];
229:       x2  = t[1 + idx];
230:       x3  = t[2 + idx];
231:       x4  = t[3 + idx];
232:       x5  = t[4 + idx];
233:       x6  = t[5 + idx];
234:       x7  = t[6 + idx];
235:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242:       v += 49;
243:     }
244:     idx        = 7 * i;
245:     t[idx]     = s1;
246:     t[1 + idx] = s2;
247:     t[2 + idx] = s3;
248:     t[3 + idx] = s4;
249:     t[4 + idx] = s5;
250:     t[5 + idx] = s6;
251:     t[6 + idx] = s7;
252:   }
253:   /* backward solve the upper triangular */
254:   for (i = n - 1; i >= 0; i--) {
255:     v   = aa + 49 * (adiag[i + 1] + 1);
256:     vi  = aj + adiag[i + 1] + 1;
257:     nz  = adiag[i] - adiag[i + 1] - 1;
258:     idt = 7 * i;
259:     s1  = t[idt];
260:     s2  = t[1 + idt];
261:     s3  = t[2 + idt];
262:     s4  = t[3 + idt];
263:     s5  = t[4 + idt];
264:     s6  = t[5 + idt];
265:     s7  = t[6 + idt];
266:     for (m = 0; m < nz; m++) {
267:       idx = 7 * vi[m];
268:       x1  = t[idx];
269:       x2  = t[1 + idx];
270:       x3  = t[2 + idx];
271:       x4  = t[3 + idx];
272:       x5  = t[4 + idx];
273:       x6  = t[5 + idx];
274:       x7  = t[6 + idx];
275:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282:       v += 49;
283:     }
284:     idc    = 7 * c[i];
285:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292:   }

294:   PetscCall(ISRestoreIndices(isrow, &rout));
295:   PetscCall(ISRestoreIndices(iscol, &cout));
296:   PetscCall(VecRestoreArrayRead(bb, &b));
297:   PetscCall(VecRestoreArray(xx, &x));
298:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299:   PetscFunctionReturn(PETSC_SUCCESS);
300: }

302: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303: {
304:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
305:   IS                 iscol = a->col, isrow = a->row;
306:   const PetscInt    *r, *c, *rout, *cout;
307:   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308:   PetscInt           i, nz, idx, idt, idc;
309:   const MatScalar   *aa = a->a, *v;
310:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311:   const PetscScalar *b;

313:   PetscFunctionBegin;
314:   PetscCall(VecGetArrayRead(bb, &b));
315:   PetscCall(VecGetArray(xx, &x));
316:   t = a->solve_work;

318:   PetscCall(ISGetIndices(isrow, &rout));
319:   r = rout;
320:   PetscCall(ISGetIndices(iscol, &cout));
321:   c = cout + (n - 1);

323:   /* forward solve the lower triangular */
324:   idx  = 6 * (*r++);
325:   t[0] = b[idx];
326:   t[1] = b[1 + idx];
327:   t[2] = b[2 + idx];
328:   t[3] = b[3 + idx];
329:   t[4] = b[4 + idx];
330:   t[5] = b[5 + idx];
331:   for (i = 1; i < n; i++) {
332:     v   = aa + 36 * ai[i];
333:     vi  = aj + ai[i];
334:     nz  = diag[i] - ai[i];
335:     idx = 6 * (*r++);
336:     s1  = b[idx];
337:     s2  = b[1 + idx];
338:     s3  = b[2 + idx];
339:     s4  = b[3 + idx];
340:     s5  = b[4 + idx];
341:     s6  = b[5 + idx];
342:     while (nz--) {
343:       idx = 6 * (*vi++);
344:       x1  = t[idx];
345:       x2  = t[1 + idx];
346:       x3  = t[2 + idx];
347:       x4  = t[3 + idx];
348:       x5  = t[4 + idx];
349:       x6  = t[5 + idx];
350:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356:       v += 36;
357:     }
358:     idx        = 6 * i;
359:     t[idx]     = s1;
360:     t[1 + idx] = s2;
361:     t[2 + idx] = s3;
362:     t[3 + idx] = s4;
363:     t[4 + idx] = s5;
364:     t[5 + idx] = s6;
365:   }
366:   /* backward solve the upper triangular */
367:   for (i = n - 1; i >= 0; i--) {
368:     v   = aa + 36 * diag[i] + 36;
369:     vi  = aj + diag[i] + 1;
370:     nz  = ai[i + 1] - diag[i] - 1;
371:     idt = 6 * i;
372:     s1  = t[idt];
373:     s2  = t[1 + idt];
374:     s3  = t[2 + idt];
375:     s4  = t[3 + idt];
376:     s5  = t[4 + idt];
377:     s6  = t[5 + idt];
378:     while (nz--) {
379:       idx = 6 * (*vi++);
380:       x1  = t[idx];
381:       x2  = t[1 + idx];
382:       x3  = t[2 + idx];
383:       x4  = t[3 + idx];
384:       x5  = t[4 + idx];
385:       x6  = t[5 + idx];
386:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392:       v += 36;
393:     }
394:     idc    = 6 * (*c--);
395:     v      = aa + 36 * diag[i];
396:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402:   }

404:   PetscCall(ISRestoreIndices(isrow, &rout));
405:   PetscCall(ISRestoreIndices(iscol, &cout));
406:   PetscCall(VecRestoreArrayRead(bb, &b));
407:   PetscCall(VecRestoreArray(xx, &x));
408:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409:   PetscFunctionReturn(PETSC_SUCCESS);
410: }

412: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413: {
414:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
415:   IS                 iscol = a->col, isrow = a->row;
416:   const PetscInt    *r, *c, *rout, *cout;
417:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418:   PetscInt           i, nz, idx, idt, idc, m;
419:   const MatScalar   *aa = a->a, *v;
420:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421:   const PetscScalar *b;

423:   PetscFunctionBegin;
424:   PetscCall(VecGetArrayRead(bb, &b));
425:   PetscCall(VecGetArray(xx, &x));
426:   t = a->solve_work;

428:   PetscCall(ISGetIndices(isrow, &rout));
429:   r = rout;
430:   PetscCall(ISGetIndices(iscol, &cout));
431:   c = cout;

433:   /* forward solve the lower triangular */
434:   idx  = 6 * r[0];
435:   t[0] = b[idx];
436:   t[1] = b[1 + idx];
437:   t[2] = b[2 + idx];
438:   t[3] = b[3 + idx];
439:   t[4] = b[4 + idx];
440:   t[5] = b[5 + idx];
441:   for (i = 1; i < n; i++) {
442:     v   = aa + 36 * ai[i];
443:     vi  = aj + ai[i];
444:     nz  = ai[i + 1] - ai[i];
445:     idx = 6 * r[i];
446:     s1  = b[idx];
447:     s2  = b[1 + idx];
448:     s3  = b[2 + idx];
449:     s4  = b[3 + idx];
450:     s5  = b[4 + idx];
451:     s6  = b[5 + idx];
452:     for (m = 0; m < nz; m++) {
453:       idx = 6 * vi[m];
454:       x1  = t[idx];
455:       x2  = t[1 + idx];
456:       x3  = t[2 + idx];
457:       x4  = t[3 + idx];
458:       x5  = t[4 + idx];
459:       x6  = t[5 + idx];
460:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466:       v += 36;
467:     }
468:     idx        = 6 * i;
469:     t[idx]     = s1;
470:     t[1 + idx] = s2;
471:     t[2 + idx] = s3;
472:     t[3 + idx] = s4;
473:     t[4 + idx] = s5;
474:     t[5 + idx] = s6;
475:   }
476:   /* backward solve the upper triangular */
477:   for (i = n - 1; i >= 0; i--) {
478:     v   = aa + 36 * (adiag[i + 1] + 1);
479:     vi  = aj + adiag[i + 1] + 1;
480:     nz  = adiag[i] - adiag[i + 1] - 1;
481:     idt = 6 * i;
482:     s1  = t[idt];
483:     s2  = t[1 + idt];
484:     s3  = t[2 + idt];
485:     s4  = t[3 + idt];
486:     s5  = t[4 + idt];
487:     s6  = t[5 + idt];
488:     for (m = 0; m < nz; m++) {
489:       idx = 6 * vi[m];
490:       x1  = t[idx];
491:       x2  = t[1 + idx];
492:       x3  = t[2 + idx];
493:       x4  = t[3 + idx];
494:       x5  = t[4 + idx];
495:       x6  = t[5 + idx];
496:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502:       v += 36;
503:     }
504:     idc    = 6 * c[i];
505:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511:   }

513:   PetscCall(ISRestoreIndices(isrow, &rout));
514:   PetscCall(ISRestoreIndices(iscol, &cout));
515:   PetscCall(VecRestoreArrayRead(bb, &b));
516:   PetscCall(VecRestoreArray(xx, &x));
517:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518:   PetscFunctionReturn(PETSC_SUCCESS);
519: }

521: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522: {
523:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
524:   IS                 iscol = a->col, isrow = a->row;
525:   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
526:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527:   PetscInt           i, nz, idx, idt, idc;
528:   const MatScalar   *aa = a->a, *v;
529:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530:   const PetscScalar *b;

532:   PetscFunctionBegin;
533:   PetscCall(VecGetArrayRead(bb, &b));
534:   PetscCall(VecGetArray(xx, &x));
535:   t = a->solve_work;

537:   PetscCall(ISGetIndices(isrow, &rout));
538:   r = rout;
539:   PetscCall(ISGetIndices(iscol, &cout));
540:   c = cout + (n - 1);

542:   /* forward solve the lower triangular */
543:   idx  = 5 * (*r++);
544:   t[0] = b[idx];
545:   t[1] = b[1 + idx];
546:   t[2] = b[2 + idx];
547:   t[3] = b[3 + idx];
548:   t[4] = b[4 + idx];
549:   for (i = 1; i < n; i++) {
550:     v   = aa + 25 * ai[i];
551:     vi  = aj + ai[i];
552:     nz  = diag[i] - ai[i];
553:     idx = 5 * (*r++);
554:     s1  = b[idx];
555:     s2  = b[1 + idx];
556:     s3  = b[2 + idx];
557:     s4  = b[3 + idx];
558:     s5  = b[4 + idx];
559:     while (nz--) {
560:       idx = 5 * (*vi++);
561:       x1  = t[idx];
562:       x2  = t[1 + idx];
563:       x3  = t[2 + idx];
564:       x4  = t[3 + idx];
565:       x5  = t[4 + idx];
566:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571:       v += 25;
572:     }
573:     idx        = 5 * i;
574:     t[idx]     = s1;
575:     t[1 + idx] = s2;
576:     t[2 + idx] = s3;
577:     t[3 + idx] = s4;
578:     t[4 + idx] = s5;
579:   }
580:   /* backward solve the upper triangular */
581:   for (i = n - 1; i >= 0; i--) {
582:     v   = aa + 25 * diag[i] + 25;
583:     vi  = aj + diag[i] + 1;
584:     nz  = ai[i + 1] - diag[i] - 1;
585:     idt = 5 * i;
586:     s1  = t[idt];
587:     s2  = t[1 + idt];
588:     s3  = t[2 + idt];
589:     s4  = t[3 + idt];
590:     s5  = t[4 + idt];
591:     while (nz--) {
592:       idx = 5 * (*vi++);
593:       x1  = t[idx];
594:       x2  = t[1 + idx];
595:       x3  = t[2 + idx];
596:       x4  = t[3 + idx];
597:       x5  = t[4 + idx];
598:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603:       v += 25;
604:     }
605:     idc    = 5 * (*c--);
606:     v      = aa + 25 * diag[i];
607:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612:   }

614:   PetscCall(ISRestoreIndices(isrow, &rout));
615:   PetscCall(ISRestoreIndices(iscol, &cout));
616:   PetscCall(VecRestoreArrayRead(bb, &b));
617:   PetscCall(VecRestoreArray(xx, &x));
618:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619:   PetscFunctionReturn(PETSC_SUCCESS);
620: }

622: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623: {
624:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
625:   IS                 iscol = a->col, isrow = a->row;
626:   const PetscInt    *r, *c, *rout, *cout;
627:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628:   PetscInt           i, nz, idx, idt, idc, m;
629:   const MatScalar   *aa = a->a, *v;
630:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631:   const PetscScalar *b;

633:   PetscFunctionBegin;
634:   PetscCall(VecGetArrayRead(bb, &b));
635:   PetscCall(VecGetArray(xx, &x));
636:   t = a->solve_work;

638:   PetscCall(ISGetIndices(isrow, &rout));
639:   r = rout;
640:   PetscCall(ISGetIndices(iscol, &cout));
641:   c = cout;

643:   /* forward solve the lower triangular */
644:   idx  = 5 * r[0];
645:   t[0] = b[idx];
646:   t[1] = b[1 + idx];
647:   t[2] = b[2 + idx];
648:   t[3] = b[3 + idx];
649:   t[4] = b[4 + idx];
650:   for (i = 1; i < n; i++) {
651:     v   = aa + 25 * ai[i];
652:     vi  = aj + ai[i];
653:     nz  = ai[i + 1] - ai[i];
654:     idx = 5 * r[i];
655:     s1  = b[idx];
656:     s2  = b[1 + idx];
657:     s3  = b[2 + idx];
658:     s4  = b[3 + idx];
659:     s5  = b[4 + idx];
660:     for (m = 0; m < nz; m++) {
661:       idx = 5 * vi[m];
662:       x1  = t[idx];
663:       x2  = t[1 + idx];
664:       x3  = t[2 + idx];
665:       x4  = t[3 + idx];
666:       x5  = t[4 + idx];
667:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672:       v += 25;
673:     }
674:     idx        = 5 * i;
675:     t[idx]     = s1;
676:     t[1 + idx] = s2;
677:     t[2 + idx] = s3;
678:     t[3 + idx] = s4;
679:     t[4 + idx] = s5;
680:   }
681:   /* backward solve the upper triangular */
682:   for (i = n - 1; i >= 0; i--) {
683:     v   = aa + 25 * (adiag[i + 1] + 1);
684:     vi  = aj + adiag[i + 1] + 1;
685:     nz  = adiag[i] - adiag[i + 1] - 1;
686:     idt = 5 * i;
687:     s1  = t[idt];
688:     s2  = t[1 + idt];
689:     s3  = t[2 + idt];
690:     s4  = t[3 + idt];
691:     s5  = t[4 + idt];
692:     for (m = 0; m < nz; m++) {
693:       idx = 5 * vi[m];
694:       x1  = t[idx];
695:       x2  = t[1 + idx];
696:       x3  = t[2 + idx];
697:       x4  = t[3 + idx];
698:       x5  = t[4 + idx];
699:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704:       v += 25;
705:     }
706:     idc    = 5 * c[i];
707:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712:   }

714:   PetscCall(ISRestoreIndices(isrow, &rout));
715:   PetscCall(ISRestoreIndices(iscol, &cout));
716:   PetscCall(VecRestoreArrayRead(bb, &b));
717:   PetscCall(VecRestoreArray(xx, &x));
718:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719:   PetscFunctionReturn(PETSC_SUCCESS);
720: }

722: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723: {
724:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
725:   IS                 iscol = a->col, isrow = a->row;
726:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727:   PetscInt           i, nz, idx, idt, idc;
728:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
729:   const MatScalar   *aa = a->a, *v;
730:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731:   const PetscScalar *b;

733:   PetscFunctionBegin;
734:   PetscCall(VecGetArrayRead(bb, &b));
735:   PetscCall(VecGetArray(xx, &x));
736:   t = a->solve_work;

738:   PetscCall(ISGetIndices(isrow, &rout));
739:   r = rout;
740:   PetscCall(ISGetIndices(iscol, &cout));
741:   c = cout + (n - 1);

743:   /* forward solve the lower triangular */
744:   idx  = 4 * (*r++);
745:   t[0] = b[idx];
746:   t[1] = b[1 + idx];
747:   t[2] = b[2 + idx];
748:   t[3] = b[3 + idx];
749:   for (i = 1; i < n; i++) {
750:     v   = aa + 16 * ai[i];
751:     vi  = aj + ai[i];
752:     nz  = diag[i] - ai[i];
753:     idx = 4 * (*r++);
754:     s1  = b[idx];
755:     s2  = b[1 + idx];
756:     s3  = b[2 + idx];
757:     s4  = b[3 + idx];
758:     while (nz--) {
759:       idx = 4 * (*vi++);
760:       x1  = t[idx];
761:       x2  = t[1 + idx];
762:       x3  = t[2 + idx];
763:       x4  = t[3 + idx];
764:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768:       v += 16;
769:     }
770:     idx        = 4 * i;
771:     t[idx]     = s1;
772:     t[1 + idx] = s2;
773:     t[2 + idx] = s3;
774:     t[3 + idx] = s4;
775:   }
776:   /* backward solve the upper triangular */
777:   for (i = n - 1; i >= 0; i--) {
778:     v   = aa + 16 * diag[i] + 16;
779:     vi  = aj + diag[i] + 1;
780:     nz  = ai[i + 1] - diag[i] - 1;
781:     idt = 4 * i;
782:     s1  = t[idt];
783:     s2  = t[1 + idt];
784:     s3  = t[2 + idt];
785:     s4  = t[3 + idt];
786:     while (nz--) {
787:       idx = 4 * (*vi++);
788:       x1  = t[idx];
789:       x2  = t[1 + idx];
790:       x3  = t[2 + idx];
791:       x4  = t[3 + idx];
792:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796:       v += 16;
797:     }
798:     idc    = 4 * (*c--);
799:     v      = aa + 16 * diag[i];
800:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804:   }

806:   PetscCall(ISRestoreIndices(isrow, &rout));
807:   PetscCall(ISRestoreIndices(iscol, &cout));
808:   PetscCall(VecRestoreArrayRead(bb, &b));
809:   PetscCall(VecRestoreArray(xx, &x));
810:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811:   PetscFunctionReturn(PETSC_SUCCESS);
812: }

814: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815: {
816:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
817:   IS                 iscol = a->col, isrow = a->row;
818:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819:   PetscInt           i, nz, idx, idt, idc, m;
820:   const PetscInt    *r, *c, *rout, *cout;
821:   const MatScalar   *aa = a->a, *v;
822:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823:   const PetscScalar *b;

825:   PetscFunctionBegin;
826:   PetscCall(VecGetArrayRead(bb, &b));
827:   PetscCall(VecGetArray(xx, &x));
828:   t = a->solve_work;

830:   PetscCall(ISGetIndices(isrow, &rout));
831:   r = rout;
832:   PetscCall(ISGetIndices(iscol, &cout));
833:   c = cout;

835:   /* forward solve the lower triangular */
836:   idx  = 4 * r[0];
837:   t[0] = b[idx];
838:   t[1] = b[1 + idx];
839:   t[2] = b[2 + idx];
840:   t[3] = b[3 + idx];
841:   for (i = 1; i < n; i++) {
842:     v   = aa + 16 * ai[i];
843:     vi  = aj + ai[i];
844:     nz  = ai[i + 1] - ai[i];
845:     idx = 4 * r[i];
846:     s1  = b[idx];
847:     s2  = b[1 + idx];
848:     s3  = b[2 + idx];
849:     s4  = b[3 + idx];
850:     for (m = 0; m < nz; m++) {
851:       idx = 4 * vi[m];
852:       x1  = t[idx];
853:       x2  = t[1 + idx];
854:       x3  = t[2 + idx];
855:       x4  = t[3 + idx];
856:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860:       v += 16;
861:     }
862:     idx        = 4 * i;
863:     t[idx]     = s1;
864:     t[1 + idx] = s2;
865:     t[2 + idx] = s3;
866:     t[3 + idx] = s4;
867:   }
868:   /* backward solve the upper triangular */
869:   for (i = n - 1; i >= 0; i--) {
870:     v   = aa + 16 * (adiag[i + 1] + 1);
871:     vi  = aj + adiag[i + 1] + 1;
872:     nz  = adiag[i] - adiag[i + 1] - 1;
873:     idt = 4 * i;
874:     s1  = t[idt];
875:     s2  = t[1 + idt];
876:     s3  = t[2 + idt];
877:     s4  = t[3 + idt];
878:     for (m = 0; m < nz; m++) {
879:       idx = 4 * vi[m];
880:       x1  = t[idx];
881:       x2  = t[1 + idx];
882:       x3  = t[2 + idx];
883:       x4  = t[3 + idx];
884:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888:       v += 16;
889:     }
890:     idc    = 4 * c[i];
891:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895:   }

897:   PetscCall(ISRestoreIndices(isrow, &rout));
898:   PetscCall(ISRestoreIndices(iscol, &cout));
899:   PetscCall(VecRestoreArrayRead(bb, &b));
900:   PetscCall(VecRestoreArray(xx, &x));
901:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902:   PetscFunctionReturn(PETSC_SUCCESS);
903: }

905: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
906: {
907:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
908:   IS                 iscol = a->col, isrow = a->row;
909:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
910:   PetscInt           i, nz, idx, idt, idc;
911:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
912:   const MatScalar   *aa = a->a, *v;
913:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
914:   const PetscScalar *b;

916:   PetscFunctionBegin;
917:   PetscCall(VecGetArrayRead(bb, &b));
918:   PetscCall(VecGetArray(xx, &x));
919:   t = a->solve_work;

921:   PetscCall(ISGetIndices(isrow, &rout));
922:   r = rout;
923:   PetscCall(ISGetIndices(iscol, &cout));
924:   c = cout + (n - 1);

926:   /* forward solve the lower triangular */
927:   idx  = 3 * (*r++);
928:   t[0] = b[idx];
929:   t[1] = b[1 + idx];
930:   t[2] = b[2 + idx];
931:   for (i = 1; i < n; i++) {
932:     v   = aa + 9 * ai[i];
933:     vi  = aj + ai[i];
934:     nz  = diag[i] - ai[i];
935:     idx = 3 * (*r++);
936:     s1  = b[idx];
937:     s2  = b[1 + idx];
938:     s3  = b[2 + idx];
939:     while (nz--) {
940:       idx = 3 * (*vi++);
941:       x1  = t[idx];
942:       x2  = t[1 + idx];
943:       x3  = t[2 + idx];
944:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
945:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
946:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
947:       v += 9;
948:     }
949:     idx        = 3 * i;
950:     t[idx]     = s1;
951:     t[1 + idx] = s2;
952:     t[2 + idx] = s3;
953:   }
954:   /* backward solve the upper triangular */
955:   for (i = n - 1; i >= 0; i--) {
956:     v   = aa + 9 * diag[i] + 9;
957:     vi  = aj + diag[i] + 1;
958:     nz  = ai[i + 1] - diag[i] - 1;
959:     idt = 3 * i;
960:     s1  = t[idt];
961:     s2  = t[1 + idt];
962:     s3  = t[2 + idt];
963:     while (nz--) {
964:       idx = 3 * (*vi++);
965:       x1  = t[idx];
966:       x2  = t[1 + idx];
967:       x3  = t[2 + idx];
968:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
969:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
970:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
971:       v += 9;
972:     }
973:     idc    = 3 * (*c--);
974:     v      = aa + 9 * diag[i];
975:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
976:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
977:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
978:   }
979:   PetscCall(ISRestoreIndices(isrow, &rout));
980:   PetscCall(ISRestoreIndices(iscol, &cout));
981:   PetscCall(VecRestoreArrayRead(bb, &b));
982:   PetscCall(VecRestoreArray(xx, &x));
983:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
988: {
989:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
990:   IS                 iscol = a->col, isrow = a->row;
991:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
992:   PetscInt           i, nz, idx, idt, idc, m;
993:   const PetscInt    *r, *c, *rout, *cout;
994:   const MatScalar   *aa = a->a, *v;
995:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
996:   const PetscScalar *b;

998:   PetscFunctionBegin;
999:   PetscCall(VecGetArrayRead(bb, &b));
1000:   PetscCall(VecGetArray(xx, &x));
1001:   t = a->solve_work;

1003:   PetscCall(ISGetIndices(isrow, &rout));
1004:   r = rout;
1005:   PetscCall(ISGetIndices(iscol, &cout));
1006:   c = cout;

1008:   /* forward solve the lower triangular */
1009:   idx  = 3 * r[0];
1010:   t[0] = b[idx];
1011:   t[1] = b[1 + idx];
1012:   t[2] = b[2 + idx];
1013:   for (i = 1; i < n; i++) {
1014:     v   = aa + 9 * ai[i];
1015:     vi  = aj + ai[i];
1016:     nz  = ai[i + 1] - ai[i];
1017:     idx = 3 * r[i];
1018:     s1  = b[idx];
1019:     s2  = b[1 + idx];
1020:     s3  = b[2 + idx];
1021:     for (m = 0; m < nz; m++) {
1022:       idx = 3 * vi[m];
1023:       x1  = t[idx];
1024:       x2  = t[1 + idx];
1025:       x3  = t[2 + idx];
1026:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1027:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1028:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1029:       v += 9;
1030:     }
1031:     idx        = 3 * i;
1032:     t[idx]     = s1;
1033:     t[1 + idx] = s2;
1034:     t[2 + idx] = s3;
1035:   }
1036:   /* backward solve the upper triangular */
1037:   for (i = n - 1; i >= 0; i--) {
1038:     v   = aa + 9 * (adiag[i + 1] + 1);
1039:     vi  = aj + adiag[i + 1] + 1;
1040:     nz  = adiag[i] - adiag[i + 1] - 1;
1041:     idt = 3 * i;
1042:     s1  = t[idt];
1043:     s2  = t[1 + idt];
1044:     s3  = t[2 + idt];
1045:     for (m = 0; m < nz; m++) {
1046:       idx = 3 * vi[m];
1047:       x1  = t[idx];
1048:       x2  = t[1 + idx];
1049:       x3  = t[2 + idx];
1050:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1051:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1052:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1053:       v += 9;
1054:     }
1055:     idc    = 3 * c[i];
1056:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1057:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1058:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1059:   }
1060:   PetscCall(ISRestoreIndices(isrow, &rout));
1061:   PetscCall(ISRestoreIndices(iscol, &cout));
1062:   PetscCall(VecRestoreArrayRead(bb, &b));
1063:   PetscCall(VecRestoreArray(xx, &x));
1064:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1065:   PetscFunctionReturn(PETSC_SUCCESS);
1066: }

1068: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1069: {
1070:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1071:   IS                 iscol = a->col, isrow = a->row;
1072:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1073:   PetscInt           i, nz, idx, idt, idc;
1074:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1075:   const MatScalar   *aa = a->a, *v;
1076:   PetscScalar       *x, s1, s2, x1, x2, *t;
1077:   const PetscScalar *b;

1079:   PetscFunctionBegin;
1080:   PetscCall(VecGetArrayRead(bb, &b));
1081:   PetscCall(VecGetArray(xx, &x));
1082:   t = a->solve_work;

1084:   PetscCall(ISGetIndices(isrow, &rout));
1085:   r = rout;
1086:   PetscCall(ISGetIndices(iscol, &cout));
1087:   c = cout + (n - 1);

1089:   /* forward solve the lower triangular */
1090:   idx  = 2 * (*r++);
1091:   t[0] = b[idx];
1092:   t[1] = b[1 + idx];
1093:   for (i = 1; i < n; i++) {
1094:     v   = aa + 4 * ai[i];
1095:     vi  = aj + ai[i];
1096:     nz  = diag[i] - ai[i];
1097:     idx = 2 * (*r++);
1098:     s1  = b[idx];
1099:     s2  = b[1 + idx];
1100:     while (nz--) {
1101:       idx = 2 * (*vi++);
1102:       x1  = t[idx];
1103:       x2  = t[1 + idx];
1104:       s1 -= v[0] * x1 + v[2] * x2;
1105:       s2 -= v[1] * x1 + v[3] * x2;
1106:       v += 4;
1107:     }
1108:     idx        = 2 * i;
1109:     t[idx]     = s1;
1110:     t[1 + idx] = s2;
1111:   }
1112:   /* backward solve the upper triangular */
1113:   for (i = n - 1; i >= 0; i--) {
1114:     v   = aa + 4 * diag[i] + 4;
1115:     vi  = aj + diag[i] + 1;
1116:     nz  = ai[i + 1] - diag[i] - 1;
1117:     idt = 2 * i;
1118:     s1  = t[idt];
1119:     s2  = t[1 + idt];
1120:     while (nz--) {
1121:       idx = 2 * (*vi++);
1122:       x1  = t[idx];
1123:       x2  = t[1 + idx];
1124:       s1 -= v[0] * x1 + v[2] * x2;
1125:       s2 -= v[1] * x1 + v[3] * x2;
1126:       v += 4;
1127:     }
1128:     idc    = 2 * (*c--);
1129:     v      = aa + 4 * diag[i];
1130:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1131:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1132:   }
1133:   PetscCall(ISRestoreIndices(isrow, &rout));
1134:   PetscCall(ISRestoreIndices(iscol, &cout));
1135:   PetscCall(VecRestoreArrayRead(bb, &b));
1136:   PetscCall(VecRestoreArray(xx, &x));
1137:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1138:   PetscFunctionReturn(PETSC_SUCCESS);
1139: }

1141: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1142: {
1143:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1144:   IS                 iscol = a->col, isrow = a->row;
1145:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1146:   PetscInt           i, nz, idx, jdx, idt, idc, m;
1147:   const PetscInt    *r, *c, *rout, *cout;
1148:   const MatScalar   *aa = a->a, *v;
1149:   PetscScalar       *x, s1, s2, x1, x2, *t;
1150:   const PetscScalar *b;

1152:   PetscFunctionBegin;
1153:   PetscCall(VecGetArrayRead(bb, &b));
1154:   PetscCall(VecGetArray(xx, &x));
1155:   t = a->solve_work;

1157:   PetscCall(ISGetIndices(isrow, &rout));
1158:   r = rout;
1159:   PetscCall(ISGetIndices(iscol, &cout));
1160:   c = cout;

1162:   /* forward solve the lower triangular */
1163:   idx  = 2 * r[0];
1164:   t[0] = b[idx];
1165:   t[1] = b[1 + idx];
1166:   for (i = 1; i < n; i++) {
1167:     v   = aa + 4 * ai[i];
1168:     vi  = aj + ai[i];
1169:     nz  = ai[i + 1] - ai[i];
1170:     idx = 2 * r[i];
1171:     s1  = b[idx];
1172:     s2  = b[1 + idx];
1173:     for (m = 0; m < nz; m++) {
1174:       jdx = 2 * vi[m];
1175:       x1  = t[jdx];
1176:       x2  = t[1 + jdx];
1177:       s1 -= v[0] * x1 + v[2] * x2;
1178:       s2 -= v[1] * x1 + v[3] * x2;
1179:       v += 4;
1180:     }
1181:     idx        = 2 * i;
1182:     t[idx]     = s1;
1183:     t[1 + idx] = s2;
1184:   }
1185:   /* backward solve the upper triangular */
1186:   for (i = n - 1; i >= 0; i--) {
1187:     v   = aa + 4 * (adiag[i + 1] + 1);
1188:     vi  = aj + adiag[i + 1] + 1;
1189:     nz  = adiag[i] - adiag[i + 1] - 1;
1190:     idt = 2 * i;
1191:     s1  = t[idt];
1192:     s2  = t[1 + idt];
1193:     for (m = 0; m < nz; m++) {
1194:       idx = 2 * vi[m];
1195:       x1  = t[idx];
1196:       x2  = t[1 + idx];
1197:       s1 -= v[0] * x1 + v[2] * x2;
1198:       s2 -= v[1] * x1 + v[3] * x2;
1199:       v += 4;
1200:     }
1201:     idc    = 2 * c[i];
1202:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1203:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1204:   }
1205:   PetscCall(ISRestoreIndices(isrow, &rout));
1206:   PetscCall(ISRestoreIndices(iscol, &cout));
1207:   PetscCall(VecRestoreArrayRead(bb, &b));
1208:   PetscCall(VecRestoreArray(xx, &x));
1209:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1210:   PetscFunctionReturn(PETSC_SUCCESS);
1211: }

1213: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1214: {
1215:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1216:   IS                 iscol = a->col, isrow = a->row;
1217:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1218:   PetscInt           i, nz;
1219:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1220:   const MatScalar   *aa = a->a, *v;
1221:   PetscScalar       *x, s1, *t;
1222:   const PetscScalar *b;

1224:   PetscFunctionBegin;
1225:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1227:   PetscCall(VecGetArrayRead(bb, &b));
1228:   PetscCall(VecGetArray(xx, &x));
1229:   t = a->solve_work;

1231:   PetscCall(ISGetIndices(isrow, &rout));
1232:   r = rout;
1233:   PetscCall(ISGetIndices(iscol, &cout));
1234:   c = cout + (n - 1);

1236:   /* forward solve the lower triangular */
1237:   t[0] = b[*r++];
1238:   for (i = 1; i < n; i++) {
1239:     v  = aa + ai[i];
1240:     vi = aj + ai[i];
1241:     nz = diag[i] - ai[i];
1242:     s1 = b[*r++];
1243:     while (nz--) s1 -= (*v++) * t[*vi++];
1244:     t[i] = s1;
1245:   }
1246:   /* backward solve the upper triangular */
1247:   for (i = n - 1; i >= 0; i--) {
1248:     v  = aa + diag[i] + 1;
1249:     vi = aj + diag[i] + 1;
1250:     nz = ai[i + 1] - diag[i] - 1;
1251:     s1 = t[i];
1252:     while (nz--) s1 -= (*v++) * t[*vi++];
1253:     x[*c--] = t[i] = aa[diag[i]] * s1;
1254:   }

1256:   PetscCall(ISRestoreIndices(isrow, &rout));
1257:   PetscCall(ISRestoreIndices(iscol, &cout));
1258:   PetscCall(VecRestoreArrayRead(bb, &b));
1259:   PetscCall(VecRestoreArray(xx, &x));
1260:   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1261:   PetscFunctionReturn(PETSC_SUCCESS);
1262: }

1264: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1265: {
1266:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1267:   IS                 iscol = a->col, isrow = a->row;
1268:   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1269:   const PetscInt    *rout, *cout, *r, *c;
1270:   PetscScalar       *x, *tmp, sum;
1271:   const PetscScalar *b;
1272:   const MatScalar   *aa = a->a, *v;

1274:   PetscFunctionBegin;
1275:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1277:   PetscCall(VecGetArrayRead(bb, &b));
1278:   PetscCall(VecGetArray(xx, &x));
1279:   tmp = a->solve_work;

1281:   PetscCall(ISGetIndices(isrow, &rout));
1282:   r = rout;
1283:   PetscCall(ISGetIndices(iscol, &cout));
1284:   c = cout;

1286:   /* forward solve the lower triangular */
1287:   tmp[0] = b[r[0]];
1288:   v      = aa;
1289:   vi     = aj;
1290:   for (i = 1; i < n; i++) {
1291:     nz  = ai[i + 1] - ai[i];
1292:     sum = b[r[i]];
1293:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1294:     tmp[i] = sum;
1295:     v += nz;
1296:     vi += nz;
1297:   }

1299:   /* backward solve the upper triangular */
1300:   for (i = n - 1; i >= 0; i--) {
1301:     v   = aa + adiag[i + 1] + 1;
1302:     vi  = aj + adiag[i + 1] + 1;
1303:     nz  = adiag[i] - adiag[i + 1] - 1;
1304:     sum = tmp[i];
1305:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1306:     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1307:   }

1309:   PetscCall(ISRestoreIndices(isrow, &rout));
1310:   PetscCall(ISRestoreIndices(iscol, &cout));
1311:   PetscCall(VecRestoreArrayRead(bb, &b));
1312:   PetscCall(VecRestoreArray(xx, &x));
1313:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1314:   PetscFunctionReturn(PETSC_SUCCESS);
1315: }