Actual source code: baijsolv.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7: IS iscol = a->col, isrow = a->row;
8: const PetscInt *r, *c, *rout, *cout;
9: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10: PetscInt i, nz;
11: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar *x, *s, *t, *ls;
14: const PetscScalar *b;
16: PetscFunctionBegin;
17: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
18: PetscCall(VecGetArrayRead(bb, &b));
19: PetscCall(VecGetArray(xx, &x));
20: t = a->solve_work;
22: PetscCall(ISGetIndices(isrow, &rout));
23: r = rout;
24: PetscCall(ISGetIndices(iscol, &cout));
25: c = cout + (n - 1);
27: /* forward solve the lower triangular */
28: PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
29: for (i = 1; i < n; i++) {
30: v = aa + bs2 * ai[i];
31: vi = aj + ai[i];
32: nz = a->diag[i] - ai[i];
33: s = t + bs * i;
34: PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
35: while (nz--) {
36: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
37: v += bs2;
38: }
39: }
40: /* backward solve the upper triangular */
41: ls = a->solve_work + A->cmap->n;
42: for (i = n - 1; i >= 0; i--) {
43: v = aa + bs2 * (a->diag[i] + 1);
44: vi = aj + a->diag[i] + 1;
45: nz = ai[i + 1] - a->diag[i] - 1;
46: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
47: while (nz--) {
48: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
49: v += bs2;
50: }
51: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
52: PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
53: }
55: PetscCall(ISRestoreIndices(isrow, &rout));
56: PetscCall(ISRestoreIndices(iscol, &cout));
57: PetscCall(VecRestoreArrayRead(bb, &b));
58: PetscCall(VecRestoreArray(xx, &x));
59: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
60: PetscFunctionReturn(PETSC_SUCCESS);
61: }
63: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
64: {
65: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
66: IS iscol = a->col, isrow = a->row;
67: const PetscInt *r, *c, *ai = a->i, *aj = a->j;
68: const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
69: PetscInt i, nz, idx, idt, idc;
70: const MatScalar *aa = a->a, *v;
71: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
72: const PetscScalar *b;
74: PetscFunctionBegin;
75: PetscCall(VecGetArrayRead(bb, &b));
76: PetscCall(VecGetArray(xx, &x));
77: t = a->solve_work;
79: PetscCall(ISGetIndices(isrow, &rout));
80: r = rout;
81: PetscCall(ISGetIndices(iscol, &cout));
82: c = cout + (n - 1);
84: /* forward solve the lower triangular */
85: idx = 7 * (*r++);
86: t[0] = b[idx];
87: t[1] = b[1 + idx];
88: t[2] = b[2 + idx];
89: t[3] = b[3 + idx];
90: t[4] = b[4 + idx];
91: t[5] = b[5 + idx];
92: t[6] = b[6 + idx];
94: for (i = 1; i < n; i++) {
95: v = aa + 49 * ai[i];
96: vi = aj + ai[i];
97: nz = diag[i] - ai[i];
98: idx = 7 * (*r++);
99: s1 = b[idx];
100: s2 = b[1 + idx];
101: s3 = b[2 + idx];
102: s4 = b[3 + idx];
103: s5 = b[4 + idx];
104: s6 = b[5 + idx];
105: s7 = b[6 + idx];
106: while (nz--) {
107: idx = 7 * (*vi++);
108: x1 = t[idx];
109: x2 = t[1 + idx];
110: x3 = t[2 + idx];
111: x4 = t[3 + idx];
112: x5 = t[4 + idx];
113: x6 = t[5 + idx];
114: x7 = t[6 + idx];
115: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122: v += 49;
123: }
124: idx = 7 * i;
125: t[idx] = s1;
126: t[1 + idx] = s2;
127: t[2 + idx] = s3;
128: t[3 + idx] = s4;
129: t[4 + idx] = s5;
130: t[5 + idx] = s6;
131: t[6 + idx] = s7;
132: }
133: /* backward solve the upper triangular */
134: for (i = n - 1; i >= 0; i--) {
135: v = aa + 49 * diag[i] + 49;
136: vi = aj + diag[i] + 1;
137: nz = ai[i + 1] - diag[i] - 1;
138: idt = 7 * i;
139: s1 = t[idt];
140: s2 = t[1 + idt];
141: s3 = t[2 + idt];
142: s4 = t[3 + idt];
143: s5 = t[4 + idt];
144: s6 = t[5 + idt];
145: s7 = t[6 + idt];
146: while (nz--) {
147: idx = 7 * (*vi++);
148: x1 = t[idx];
149: x2 = t[1 + idx];
150: x3 = t[2 + idx];
151: x4 = t[3 + idx];
152: x5 = t[4 + idx];
153: x6 = t[5 + idx];
154: x7 = t[6 + idx];
155: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162: v += 49;
163: }
164: idc = 7 * (*c--);
165: v = aa + 49 * diag[i];
166: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173: }
175: PetscCall(ISRestoreIndices(isrow, &rout));
176: PetscCall(ISRestoreIndices(iscol, &cout));
177: PetscCall(VecRestoreArrayRead(bb, &b));
178: PetscCall(VecRestoreArray(xx, &x));
179: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184: {
185: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
186: IS iscol = a->col, isrow = a->row;
187: const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188: const PetscInt n = a->mbs, *rout, *cout, *vi;
189: PetscInt i, nz, idx, idt, idc, m;
190: const MatScalar *aa = a->a, *v;
191: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192: const PetscScalar *b;
194: PetscFunctionBegin;
195: PetscCall(VecGetArrayRead(bb, &b));
196: PetscCall(VecGetArray(xx, &x));
197: t = a->solve_work;
199: PetscCall(ISGetIndices(isrow, &rout));
200: r = rout;
201: PetscCall(ISGetIndices(iscol, &cout));
202: c = cout;
204: /* forward solve the lower triangular */
205: idx = 7 * r[0];
206: t[0] = b[idx];
207: t[1] = b[1 + idx];
208: t[2] = b[2 + idx];
209: t[3] = b[3 + idx];
210: t[4] = b[4 + idx];
211: t[5] = b[5 + idx];
212: t[6] = b[6 + idx];
214: for (i = 1; i < n; i++) {
215: v = aa + 49 * ai[i];
216: vi = aj + ai[i];
217: nz = ai[i + 1] - ai[i];
218: idx = 7 * r[i];
219: s1 = b[idx];
220: s2 = b[1 + idx];
221: s3 = b[2 + idx];
222: s4 = b[3 + idx];
223: s5 = b[4 + idx];
224: s6 = b[5 + idx];
225: s7 = b[6 + idx];
226: for (m = 0; m < nz; m++) {
227: idx = 7 * vi[m];
228: x1 = t[idx];
229: x2 = t[1 + idx];
230: x3 = t[2 + idx];
231: x4 = t[3 + idx];
232: x5 = t[4 + idx];
233: x6 = t[5 + idx];
234: x7 = t[6 + idx];
235: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242: v += 49;
243: }
244: idx = 7 * i;
245: t[idx] = s1;
246: t[1 + idx] = s2;
247: t[2 + idx] = s3;
248: t[3 + idx] = s4;
249: t[4 + idx] = s5;
250: t[5 + idx] = s6;
251: t[6 + idx] = s7;
252: }
253: /* backward solve the upper triangular */
254: for (i = n - 1; i >= 0; i--) {
255: v = aa + 49 * (adiag[i + 1] + 1);
256: vi = aj + adiag[i + 1] + 1;
257: nz = adiag[i] - adiag[i + 1] - 1;
258: idt = 7 * i;
259: s1 = t[idt];
260: s2 = t[1 + idt];
261: s3 = t[2 + idt];
262: s4 = t[3 + idt];
263: s5 = t[4 + idt];
264: s6 = t[5 + idt];
265: s7 = t[6 + idt];
266: for (m = 0; m < nz; m++) {
267: idx = 7 * vi[m];
268: x1 = t[idx];
269: x2 = t[1 + idx];
270: x3 = t[2 + idx];
271: x4 = t[3 + idx];
272: x5 = t[4 + idx];
273: x6 = t[5 + idx];
274: x7 = t[6 + idx];
275: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282: v += 49;
283: }
284: idc = 7 * c[i];
285: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292: }
294: PetscCall(ISRestoreIndices(isrow, &rout));
295: PetscCall(ISRestoreIndices(iscol, &cout));
296: PetscCall(VecRestoreArrayRead(bb, &b));
297: PetscCall(VecRestoreArray(xx, &x));
298: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299: PetscFunctionReturn(PETSC_SUCCESS);
300: }
302: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303: {
304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
305: IS iscol = a->col, isrow = a->row;
306: const PetscInt *r, *c, *rout, *cout;
307: const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308: PetscInt i, nz, idx, idt, idc;
309: const MatScalar *aa = a->a, *v;
310: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311: const PetscScalar *b;
313: PetscFunctionBegin;
314: PetscCall(VecGetArrayRead(bb, &b));
315: PetscCall(VecGetArray(xx, &x));
316: t = a->solve_work;
318: PetscCall(ISGetIndices(isrow, &rout));
319: r = rout;
320: PetscCall(ISGetIndices(iscol, &cout));
321: c = cout + (n - 1);
323: /* forward solve the lower triangular */
324: idx = 6 * (*r++);
325: t[0] = b[idx];
326: t[1] = b[1 + idx];
327: t[2] = b[2 + idx];
328: t[3] = b[3 + idx];
329: t[4] = b[4 + idx];
330: t[5] = b[5 + idx];
331: for (i = 1; i < n; i++) {
332: v = aa + 36 * ai[i];
333: vi = aj + ai[i];
334: nz = diag[i] - ai[i];
335: idx = 6 * (*r++);
336: s1 = b[idx];
337: s2 = b[1 + idx];
338: s3 = b[2 + idx];
339: s4 = b[3 + idx];
340: s5 = b[4 + idx];
341: s6 = b[5 + idx];
342: while (nz--) {
343: idx = 6 * (*vi++);
344: x1 = t[idx];
345: x2 = t[1 + idx];
346: x3 = t[2 + idx];
347: x4 = t[3 + idx];
348: x5 = t[4 + idx];
349: x6 = t[5 + idx];
350: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356: v += 36;
357: }
358: idx = 6 * i;
359: t[idx] = s1;
360: t[1 + idx] = s2;
361: t[2 + idx] = s3;
362: t[3 + idx] = s4;
363: t[4 + idx] = s5;
364: t[5 + idx] = s6;
365: }
366: /* backward solve the upper triangular */
367: for (i = n - 1; i >= 0; i--) {
368: v = aa + 36 * diag[i] + 36;
369: vi = aj + diag[i] + 1;
370: nz = ai[i + 1] - diag[i] - 1;
371: idt = 6 * i;
372: s1 = t[idt];
373: s2 = t[1 + idt];
374: s3 = t[2 + idt];
375: s4 = t[3 + idt];
376: s5 = t[4 + idt];
377: s6 = t[5 + idt];
378: while (nz--) {
379: idx = 6 * (*vi++);
380: x1 = t[idx];
381: x2 = t[1 + idx];
382: x3 = t[2 + idx];
383: x4 = t[3 + idx];
384: x5 = t[4 + idx];
385: x6 = t[5 + idx];
386: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392: v += 36;
393: }
394: idc = 6 * (*c--);
395: v = aa + 36 * diag[i];
396: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402: }
404: PetscCall(ISRestoreIndices(isrow, &rout));
405: PetscCall(ISRestoreIndices(iscol, &cout));
406: PetscCall(VecRestoreArrayRead(bb, &b));
407: PetscCall(VecRestoreArray(xx, &x));
408: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409: PetscFunctionReturn(PETSC_SUCCESS);
410: }
412: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413: {
414: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
415: IS iscol = a->col, isrow = a->row;
416: const PetscInt *r, *c, *rout, *cout;
417: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418: PetscInt i, nz, idx, idt, idc, m;
419: const MatScalar *aa = a->a, *v;
420: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421: const PetscScalar *b;
423: PetscFunctionBegin;
424: PetscCall(VecGetArrayRead(bb, &b));
425: PetscCall(VecGetArray(xx, &x));
426: t = a->solve_work;
428: PetscCall(ISGetIndices(isrow, &rout));
429: r = rout;
430: PetscCall(ISGetIndices(iscol, &cout));
431: c = cout;
433: /* forward solve the lower triangular */
434: idx = 6 * r[0];
435: t[0] = b[idx];
436: t[1] = b[1 + idx];
437: t[2] = b[2 + idx];
438: t[3] = b[3 + idx];
439: t[4] = b[4 + idx];
440: t[5] = b[5 + idx];
441: for (i = 1; i < n; i++) {
442: v = aa + 36 * ai[i];
443: vi = aj + ai[i];
444: nz = ai[i + 1] - ai[i];
445: idx = 6 * r[i];
446: s1 = b[idx];
447: s2 = b[1 + idx];
448: s3 = b[2 + idx];
449: s4 = b[3 + idx];
450: s5 = b[4 + idx];
451: s6 = b[5 + idx];
452: for (m = 0; m < nz; m++) {
453: idx = 6 * vi[m];
454: x1 = t[idx];
455: x2 = t[1 + idx];
456: x3 = t[2 + idx];
457: x4 = t[3 + idx];
458: x5 = t[4 + idx];
459: x6 = t[5 + idx];
460: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466: v += 36;
467: }
468: idx = 6 * i;
469: t[idx] = s1;
470: t[1 + idx] = s2;
471: t[2 + idx] = s3;
472: t[3 + idx] = s4;
473: t[4 + idx] = s5;
474: t[5 + idx] = s6;
475: }
476: /* backward solve the upper triangular */
477: for (i = n - 1; i >= 0; i--) {
478: v = aa + 36 * (adiag[i + 1] + 1);
479: vi = aj + adiag[i + 1] + 1;
480: nz = adiag[i] - adiag[i + 1] - 1;
481: idt = 6 * i;
482: s1 = t[idt];
483: s2 = t[1 + idt];
484: s3 = t[2 + idt];
485: s4 = t[3 + idt];
486: s5 = t[4 + idt];
487: s6 = t[5 + idt];
488: for (m = 0; m < nz; m++) {
489: idx = 6 * vi[m];
490: x1 = t[idx];
491: x2 = t[1 + idx];
492: x3 = t[2 + idx];
493: x4 = t[3 + idx];
494: x5 = t[4 + idx];
495: x6 = t[5 + idx];
496: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502: v += 36;
503: }
504: idc = 6 * c[i];
505: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511: }
513: PetscCall(ISRestoreIndices(isrow, &rout));
514: PetscCall(ISRestoreIndices(iscol, &cout));
515: PetscCall(VecRestoreArrayRead(bb, &b));
516: PetscCall(VecRestoreArray(xx, &x));
517: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518: PetscFunctionReturn(PETSC_SUCCESS);
519: }
521: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522: {
523: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
524: IS iscol = a->col, isrow = a->row;
525: const PetscInt *r, *c, *rout, *cout, *diag = a->diag;
526: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527: PetscInt i, nz, idx, idt, idc;
528: const MatScalar *aa = a->a, *v;
529: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530: const PetscScalar *b;
532: PetscFunctionBegin;
533: PetscCall(VecGetArrayRead(bb, &b));
534: PetscCall(VecGetArray(xx, &x));
535: t = a->solve_work;
537: PetscCall(ISGetIndices(isrow, &rout));
538: r = rout;
539: PetscCall(ISGetIndices(iscol, &cout));
540: c = cout + (n - 1);
542: /* forward solve the lower triangular */
543: idx = 5 * (*r++);
544: t[0] = b[idx];
545: t[1] = b[1 + idx];
546: t[2] = b[2 + idx];
547: t[3] = b[3 + idx];
548: t[4] = b[4 + idx];
549: for (i = 1; i < n; i++) {
550: v = aa + 25 * ai[i];
551: vi = aj + ai[i];
552: nz = diag[i] - ai[i];
553: idx = 5 * (*r++);
554: s1 = b[idx];
555: s2 = b[1 + idx];
556: s3 = b[2 + idx];
557: s4 = b[3 + idx];
558: s5 = b[4 + idx];
559: while (nz--) {
560: idx = 5 * (*vi++);
561: x1 = t[idx];
562: x2 = t[1 + idx];
563: x3 = t[2 + idx];
564: x4 = t[3 + idx];
565: x5 = t[4 + idx];
566: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571: v += 25;
572: }
573: idx = 5 * i;
574: t[idx] = s1;
575: t[1 + idx] = s2;
576: t[2 + idx] = s3;
577: t[3 + idx] = s4;
578: t[4 + idx] = s5;
579: }
580: /* backward solve the upper triangular */
581: for (i = n - 1; i >= 0; i--) {
582: v = aa + 25 * diag[i] + 25;
583: vi = aj + diag[i] + 1;
584: nz = ai[i + 1] - diag[i] - 1;
585: idt = 5 * i;
586: s1 = t[idt];
587: s2 = t[1 + idt];
588: s3 = t[2 + idt];
589: s4 = t[3 + idt];
590: s5 = t[4 + idt];
591: while (nz--) {
592: idx = 5 * (*vi++);
593: x1 = t[idx];
594: x2 = t[1 + idx];
595: x3 = t[2 + idx];
596: x4 = t[3 + idx];
597: x5 = t[4 + idx];
598: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603: v += 25;
604: }
605: idc = 5 * (*c--);
606: v = aa + 25 * diag[i];
607: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612: }
614: PetscCall(ISRestoreIndices(isrow, &rout));
615: PetscCall(ISRestoreIndices(iscol, &cout));
616: PetscCall(VecRestoreArrayRead(bb, &b));
617: PetscCall(VecRestoreArray(xx, &x));
618: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619: PetscFunctionReturn(PETSC_SUCCESS);
620: }
622: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623: {
624: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
625: IS iscol = a->col, isrow = a->row;
626: const PetscInt *r, *c, *rout, *cout;
627: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628: PetscInt i, nz, idx, idt, idc, m;
629: const MatScalar *aa = a->a, *v;
630: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631: const PetscScalar *b;
633: PetscFunctionBegin;
634: PetscCall(VecGetArrayRead(bb, &b));
635: PetscCall(VecGetArray(xx, &x));
636: t = a->solve_work;
638: PetscCall(ISGetIndices(isrow, &rout));
639: r = rout;
640: PetscCall(ISGetIndices(iscol, &cout));
641: c = cout;
643: /* forward solve the lower triangular */
644: idx = 5 * r[0];
645: t[0] = b[idx];
646: t[1] = b[1 + idx];
647: t[2] = b[2 + idx];
648: t[3] = b[3 + idx];
649: t[4] = b[4 + idx];
650: for (i = 1; i < n; i++) {
651: v = aa + 25 * ai[i];
652: vi = aj + ai[i];
653: nz = ai[i + 1] - ai[i];
654: idx = 5 * r[i];
655: s1 = b[idx];
656: s2 = b[1 + idx];
657: s3 = b[2 + idx];
658: s4 = b[3 + idx];
659: s5 = b[4 + idx];
660: for (m = 0; m < nz; m++) {
661: idx = 5 * vi[m];
662: x1 = t[idx];
663: x2 = t[1 + idx];
664: x3 = t[2 + idx];
665: x4 = t[3 + idx];
666: x5 = t[4 + idx];
667: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672: v += 25;
673: }
674: idx = 5 * i;
675: t[idx] = s1;
676: t[1 + idx] = s2;
677: t[2 + idx] = s3;
678: t[3 + idx] = s4;
679: t[4 + idx] = s5;
680: }
681: /* backward solve the upper triangular */
682: for (i = n - 1; i >= 0; i--) {
683: v = aa + 25 * (adiag[i + 1] + 1);
684: vi = aj + adiag[i + 1] + 1;
685: nz = adiag[i] - adiag[i + 1] - 1;
686: idt = 5 * i;
687: s1 = t[idt];
688: s2 = t[1 + idt];
689: s3 = t[2 + idt];
690: s4 = t[3 + idt];
691: s5 = t[4 + idt];
692: for (m = 0; m < nz; m++) {
693: idx = 5 * vi[m];
694: x1 = t[idx];
695: x2 = t[1 + idx];
696: x3 = t[2 + idx];
697: x4 = t[3 + idx];
698: x5 = t[4 + idx];
699: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704: v += 25;
705: }
706: idc = 5 * c[i];
707: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712: }
714: PetscCall(ISRestoreIndices(isrow, &rout));
715: PetscCall(ISRestoreIndices(iscol, &cout));
716: PetscCall(VecRestoreArrayRead(bb, &b));
717: PetscCall(VecRestoreArray(xx, &x));
718: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719: PetscFunctionReturn(PETSC_SUCCESS);
720: }
722: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723: {
724: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
725: IS iscol = a->col, isrow = a->row;
726: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727: PetscInt i, nz, idx, idt, idc;
728: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
729: const MatScalar *aa = a->a, *v;
730: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731: const PetscScalar *b;
733: PetscFunctionBegin;
734: PetscCall(VecGetArrayRead(bb, &b));
735: PetscCall(VecGetArray(xx, &x));
736: t = a->solve_work;
738: PetscCall(ISGetIndices(isrow, &rout));
739: r = rout;
740: PetscCall(ISGetIndices(iscol, &cout));
741: c = cout + (n - 1);
743: /* forward solve the lower triangular */
744: idx = 4 * (*r++);
745: t[0] = b[idx];
746: t[1] = b[1 + idx];
747: t[2] = b[2 + idx];
748: t[3] = b[3 + idx];
749: for (i = 1; i < n; i++) {
750: v = aa + 16 * ai[i];
751: vi = aj + ai[i];
752: nz = diag[i] - ai[i];
753: idx = 4 * (*r++);
754: s1 = b[idx];
755: s2 = b[1 + idx];
756: s3 = b[2 + idx];
757: s4 = b[3 + idx];
758: while (nz--) {
759: idx = 4 * (*vi++);
760: x1 = t[idx];
761: x2 = t[1 + idx];
762: x3 = t[2 + idx];
763: x4 = t[3 + idx];
764: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768: v += 16;
769: }
770: idx = 4 * i;
771: t[idx] = s1;
772: t[1 + idx] = s2;
773: t[2 + idx] = s3;
774: t[3 + idx] = s4;
775: }
776: /* backward solve the upper triangular */
777: for (i = n - 1; i >= 0; i--) {
778: v = aa + 16 * diag[i] + 16;
779: vi = aj + diag[i] + 1;
780: nz = ai[i + 1] - diag[i] - 1;
781: idt = 4 * i;
782: s1 = t[idt];
783: s2 = t[1 + idt];
784: s3 = t[2 + idt];
785: s4 = t[3 + idt];
786: while (nz--) {
787: idx = 4 * (*vi++);
788: x1 = t[idx];
789: x2 = t[1 + idx];
790: x3 = t[2 + idx];
791: x4 = t[3 + idx];
792: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796: v += 16;
797: }
798: idc = 4 * (*c--);
799: v = aa + 16 * diag[i];
800: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804: }
806: PetscCall(ISRestoreIndices(isrow, &rout));
807: PetscCall(ISRestoreIndices(iscol, &cout));
808: PetscCall(VecRestoreArrayRead(bb, &b));
809: PetscCall(VecRestoreArray(xx, &x));
810: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811: PetscFunctionReturn(PETSC_SUCCESS);
812: }
814: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815: {
816: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
817: IS iscol = a->col, isrow = a->row;
818: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819: PetscInt i, nz, idx, idt, idc, m;
820: const PetscInt *r, *c, *rout, *cout;
821: const MatScalar *aa = a->a, *v;
822: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823: const PetscScalar *b;
825: PetscFunctionBegin;
826: PetscCall(VecGetArrayRead(bb, &b));
827: PetscCall(VecGetArray(xx, &x));
828: t = a->solve_work;
830: PetscCall(ISGetIndices(isrow, &rout));
831: r = rout;
832: PetscCall(ISGetIndices(iscol, &cout));
833: c = cout;
835: /* forward solve the lower triangular */
836: idx = 4 * r[0];
837: t[0] = b[idx];
838: t[1] = b[1 + idx];
839: t[2] = b[2 + idx];
840: t[3] = b[3 + idx];
841: for (i = 1; i < n; i++) {
842: v = aa + 16 * ai[i];
843: vi = aj + ai[i];
844: nz = ai[i + 1] - ai[i];
845: idx = 4 * r[i];
846: s1 = b[idx];
847: s2 = b[1 + idx];
848: s3 = b[2 + idx];
849: s4 = b[3 + idx];
850: for (m = 0; m < nz; m++) {
851: idx = 4 * vi[m];
852: x1 = t[idx];
853: x2 = t[1 + idx];
854: x3 = t[2 + idx];
855: x4 = t[3 + idx];
856: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860: v += 16;
861: }
862: idx = 4 * i;
863: t[idx] = s1;
864: t[1 + idx] = s2;
865: t[2 + idx] = s3;
866: t[3 + idx] = s4;
867: }
868: /* backward solve the upper triangular */
869: for (i = n - 1; i >= 0; i--) {
870: v = aa + 16 * (adiag[i + 1] + 1);
871: vi = aj + adiag[i + 1] + 1;
872: nz = adiag[i] - adiag[i + 1] - 1;
873: idt = 4 * i;
874: s1 = t[idt];
875: s2 = t[1 + idt];
876: s3 = t[2 + idt];
877: s4 = t[3 + idt];
878: for (m = 0; m < nz; m++) {
879: idx = 4 * vi[m];
880: x1 = t[idx];
881: x2 = t[1 + idx];
882: x3 = t[2 + idx];
883: x4 = t[3 + idx];
884: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888: v += 16;
889: }
890: idc = 4 * c[i];
891: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895: }
897: PetscCall(ISRestoreIndices(isrow, &rout));
898: PetscCall(ISRestoreIndices(iscol, &cout));
899: PetscCall(VecRestoreArrayRead(bb, &b));
900: PetscCall(VecRestoreArray(xx, &x));
901: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902: PetscFunctionReturn(PETSC_SUCCESS);
903: }
905: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
906: {
907: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
908: IS iscol = a->col, isrow = a->row;
909: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
910: PetscInt i, nz, idx, idt, idc;
911: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
912: const MatScalar *aa = a->a, *v;
913: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
914: const PetscScalar *b;
916: PetscFunctionBegin;
917: PetscCall(VecGetArrayRead(bb, &b));
918: PetscCall(VecGetArray(xx, &x));
919: t = a->solve_work;
921: PetscCall(ISGetIndices(isrow, &rout));
922: r = rout;
923: PetscCall(ISGetIndices(iscol, &cout));
924: c = cout + (n - 1);
926: /* forward solve the lower triangular */
927: idx = 3 * (*r++);
928: t[0] = b[idx];
929: t[1] = b[1 + idx];
930: t[2] = b[2 + idx];
931: for (i = 1; i < n; i++) {
932: v = aa + 9 * ai[i];
933: vi = aj + ai[i];
934: nz = diag[i] - ai[i];
935: idx = 3 * (*r++);
936: s1 = b[idx];
937: s2 = b[1 + idx];
938: s3 = b[2 + idx];
939: while (nz--) {
940: idx = 3 * (*vi++);
941: x1 = t[idx];
942: x2 = t[1 + idx];
943: x3 = t[2 + idx];
944: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
945: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
946: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
947: v += 9;
948: }
949: idx = 3 * i;
950: t[idx] = s1;
951: t[1 + idx] = s2;
952: t[2 + idx] = s3;
953: }
954: /* backward solve the upper triangular */
955: for (i = n - 1; i >= 0; i--) {
956: v = aa + 9 * diag[i] + 9;
957: vi = aj + diag[i] + 1;
958: nz = ai[i + 1] - diag[i] - 1;
959: idt = 3 * i;
960: s1 = t[idt];
961: s2 = t[1 + idt];
962: s3 = t[2 + idt];
963: while (nz--) {
964: idx = 3 * (*vi++);
965: x1 = t[idx];
966: x2 = t[1 + idx];
967: x3 = t[2 + idx];
968: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
969: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
970: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
971: v += 9;
972: }
973: idc = 3 * (*c--);
974: v = aa + 9 * diag[i];
975: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
976: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
977: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
978: }
979: PetscCall(ISRestoreIndices(isrow, &rout));
980: PetscCall(ISRestoreIndices(iscol, &cout));
981: PetscCall(VecRestoreArrayRead(bb, &b));
982: PetscCall(VecRestoreArray(xx, &x));
983: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
988: {
989: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
990: IS iscol = a->col, isrow = a->row;
991: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
992: PetscInt i, nz, idx, idt, idc, m;
993: const PetscInt *r, *c, *rout, *cout;
994: const MatScalar *aa = a->a, *v;
995: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
996: const PetscScalar *b;
998: PetscFunctionBegin;
999: PetscCall(VecGetArrayRead(bb, &b));
1000: PetscCall(VecGetArray(xx, &x));
1001: t = a->solve_work;
1003: PetscCall(ISGetIndices(isrow, &rout));
1004: r = rout;
1005: PetscCall(ISGetIndices(iscol, &cout));
1006: c = cout;
1008: /* forward solve the lower triangular */
1009: idx = 3 * r[0];
1010: t[0] = b[idx];
1011: t[1] = b[1 + idx];
1012: t[2] = b[2 + idx];
1013: for (i = 1; i < n; i++) {
1014: v = aa + 9 * ai[i];
1015: vi = aj + ai[i];
1016: nz = ai[i + 1] - ai[i];
1017: idx = 3 * r[i];
1018: s1 = b[idx];
1019: s2 = b[1 + idx];
1020: s3 = b[2 + idx];
1021: for (m = 0; m < nz; m++) {
1022: idx = 3 * vi[m];
1023: x1 = t[idx];
1024: x2 = t[1 + idx];
1025: x3 = t[2 + idx];
1026: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1027: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1028: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1029: v += 9;
1030: }
1031: idx = 3 * i;
1032: t[idx] = s1;
1033: t[1 + idx] = s2;
1034: t[2 + idx] = s3;
1035: }
1036: /* backward solve the upper triangular */
1037: for (i = n - 1; i >= 0; i--) {
1038: v = aa + 9 * (adiag[i + 1] + 1);
1039: vi = aj + adiag[i + 1] + 1;
1040: nz = adiag[i] - adiag[i + 1] - 1;
1041: idt = 3 * i;
1042: s1 = t[idt];
1043: s2 = t[1 + idt];
1044: s3 = t[2 + idt];
1045: for (m = 0; m < nz; m++) {
1046: idx = 3 * vi[m];
1047: x1 = t[idx];
1048: x2 = t[1 + idx];
1049: x3 = t[2 + idx];
1050: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1051: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1052: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1053: v += 9;
1054: }
1055: idc = 3 * c[i];
1056: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1057: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1058: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1059: }
1060: PetscCall(ISRestoreIndices(isrow, &rout));
1061: PetscCall(ISRestoreIndices(iscol, &cout));
1062: PetscCall(VecRestoreArrayRead(bb, &b));
1063: PetscCall(VecRestoreArray(xx, &x));
1064: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1065: PetscFunctionReturn(PETSC_SUCCESS);
1066: }
1068: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1069: {
1070: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1071: IS iscol = a->col, isrow = a->row;
1072: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1073: PetscInt i, nz, idx, idt, idc;
1074: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1075: const MatScalar *aa = a->a, *v;
1076: PetscScalar *x, s1, s2, x1, x2, *t;
1077: const PetscScalar *b;
1079: PetscFunctionBegin;
1080: PetscCall(VecGetArrayRead(bb, &b));
1081: PetscCall(VecGetArray(xx, &x));
1082: t = a->solve_work;
1084: PetscCall(ISGetIndices(isrow, &rout));
1085: r = rout;
1086: PetscCall(ISGetIndices(iscol, &cout));
1087: c = cout + (n - 1);
1089: /* forward solve the lower triangular */
1090: idx = 2 * (*r++);
1091: t[0] = b[idx];
1092: t[1] = b[1 + idx];
1093: for (i = 1; i < n; i++) {
1094: v = aa + 4 * ai[i];
1095: vi = aj + ai[i];
1096: nz = diag[i] - ai[i];
1097: idx = 2 * (*r++);
1098: s1 = b[idx];
1099: s2 = b[1 + idx];
1100: while (nz--) {
1101: idx = 2 * (*vi++);
1102: x1 = t[idx];
1103: x2 = t[1 + idx];
1104: s1 -= v[0] * x1 + v[2] * x2;
1105: s2 -= v[1] * x1 + v[3] * x2;
1106: v += 4;
1107: }
1108: idx = 2 * i;
1109: t[idx] = s1;
1110: t[1 + idx] = s2;
1111: }
1112: /* backward solve the upper triangular */
1113: for (i = n - 1; i >= 0; i--) {
1114: v = aa + 4 * diag[i] + 4;
1115: vi = aj + diag[i] + 1;
1116: nz = ai[i + 1] - diag[i] - 1;
1117: idt = 2 * i;
1118: s1 = t[idt];
1119: s2 = t[1 + idt];
1120: while (nz--) {
1121: idx = 2 * (*vi++);
1122: x1 = t[idx];
1123: x2 = t[1 + idx];
1124: s1 -= v[0] * x1 + v[2] * x2;
1125: s2 -= v[1] * x1 + v[3] * x2;
1126: v += 4;
1127: }
1128: idc = 2 * (*c--);
1129: v = aa + 4 * diag[i];
1130: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1131: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1132: }
1133: PetscCall(ISRestoreIndices(isrow, &rout));
1134: PetscCall(ISRestoreIndices(iscol, &cout));
1135: PetscCall(VecRestoreArrayRead(bb, &b));
1136: PetscCall(VecRestoreArray(xx, &x));
1137: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1138: PetscFunctionReturn(PETSC_SUCCESS);
1139: }
1141: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1142: {
1143: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1144: IS iscol = a->col, isrow = a->row;
1145: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1146: PetscInt i, nz, idx, jdx, idt, idc, m;
1147: const PetscInt *r, *c, *rout, *cout;
1148: const MatScalar *aa = a->a, *v;
1149: PetscScalar *x, s1, s2, x1, x2, *t;
1150: const PetscScalar *b;
1152: PetscFunctionBegin;
1153: PetscCall(VecGetArrayRead(bb, &b));
1154: PetscCall(VecGetArray(xx, &x));
1155: t = a->solve_work;
1157: PetscCall(ISGetIndices(isrow, &rout));
1158: r = rout;
1159: PetscCall(ISGetIndices(iscol, &cout));
1160: c = cout;
1162: /* forward solve the lower triangular */
1163: idx = 2 * r[0];
1164: t[0] = b[idx];
1165: t[1] = b[1 + idx];
1166: for (i = 1; i < n; i++) {
1167: v = aa + 4 * ai[i];
1168: vi = aj + ai[i];
1169: nz = ai[i + 1] - ai[i];
1170: idx = 2 * r[i];
1171: s1 = b[idx];
1172: s2 = b[1 + idx];
1173: for (m = 0; m < nz; m++) {
1174: jdx = 2 * vi[m];
1175: x1 = t[jdx];
1176: x2 = t[1 + jdx];
1177: s1 -= v[0] * x1 + v[2] * x2;
1178: s2 -= v[1] * x1 + v[3] * x2;
1179: v += 4;
1180: }
1181: idx = 2 * i;
1182: t[idx] = s1;
1183: t[1 + idx] = s2;
1184: }
1185: /* backward solve the upper triangular */
1186: for (i = n - 1; i >= 0; i--) {
1187: v = aa + 4 * (adiag[i + 1] + 1);
1188: vi = aj + adiag[i + 1] + 1;
1189: nz = adiag[i] - adiag[i + 1] - 1;
1190: idt = 2 * i;
1191: s1 = t[idt];
1192: s2 = t[1 + idt];
1193: for (m = 0; m < nz; m++) {
1194: idx = 2 * vi[m];
1195: x1 = t[idx];
1196: x2 = t[1 + idx];
1197: s1 -= v[0] * x1 + v[2] * x2;
1198: s2 -= v[1] * x1 + v[3] * x2;
1199: v += 4;
1200: }
1201: idc = 2 * c[i];
1202: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1203: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1204: }
1205: PetscCall(ISRestoreIndices(isrow, &rout));
1206: PetscCall(ISRestoreIndices(iscol, &cout));
1207: PetscCall(VecRestoreArrayRead(bb, &b));
1208: PetscCall(VecRestoreArray(xx, &x));
1209: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1210: PetscFunctionReturn(PETSC_SUCCESS);
1211: }
1213: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1214: {
1215: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1216: IS iscol = a->col, isrow = a->row;
1217: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1218: PetscInt i, nz;
1219: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1220: const MatScalar *aa = a->a, *v;
1221: PetscScalar *x, s1, *t;
1222: const PetscScalar *b;
1224: PetscFunctionBegin;
1225: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1227: PetscCall(VecGetArrayRead(bb, &b));
1228: PetscCall(VecGetArray(xx, &x));
1229: t = a->solve_work;
1231: PetscCall(ISGetIndices(isrow, &rout));
1232: r = rout;
1233: PetscCall(ISGetIndices(iscol, &cout));
1234: c = cout + (n - 1);
1236: /* forward solve the lower triangular */
1237: t[0] = b[*r++];
1238: for (i = 1; i < n; i++) {
1239: v = aa + ai[i];
1240: vi = aj + ai[i];
1241: nz = diag[i] - ai[i];
1242: s1 = b[*r++];
1243: while (nz--) s1 -= (*v++) * t[*vi++];
1244: t[i] = s1;
1245: }
1246: /* backward solve the upper triangular */
1247: for (i = n - 1; i >= 0; i--) {
1248: v = aa + diag[i] + 1;
1249: vi = aj + diag[i] + 1;
1250: nz = ai[i + 1] - diag[i] - 1;
1251: s1 = t[i];
1252: while (nz--) s1 -= (*v++) * t[*vi++];
1253: x[*c--] = t[i] = aa[diag[i]] * s1;
1254: }
1256: PetscCall(ISRestoreIndices(isrow, &rout));
1257: PetscCall(ISRestoreIndices(iscol, &cout));
1258: PetscCall(VecRestoreArrayRead(bb, &b));
1259: PetscCall(VecRestoreArray(xx, &x));
1260: PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1261: PetscFunctionReturn(PETSC_SUCCESS);
1262: }
1264: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1265: {
1266: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1267: IS iscol = a->col, isrow = a->row;
1268: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1269: const PetscInt *rout, *cout, *r, *c;
1270: PetscScalar *x, *tmp, sum;
1271: const PetscScalar *b;
1272: const MatScalar *aa = a->a, *v;
1274: PetscFunctionBegin;
1275: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1277: PetscCall(VecGetArrayRead(bb, &b));
1278: PetscCall(VecGetArray(xx, &x));
1279: tmp = a->solve_work;
1281: PetscCall(ISGetIndices(isrow, &rout));
1282: r = rout;
1283: PetscCall(ISGetIndices(iscol, &cout));
1284: c = cout;
1286: /* forward solve the lower triangular */
1287: tmp[0] = b[r[0]];
1288: v = aa;
1289: vi = aj;
1290: for (i = 1; i < n; i++) {
1291: nz = ai[i + 1] - ai[i];
1292: sum = b[r[i]];
1293: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1294: tmp[i] = sum;
1295: v += nz;
1296: vi += nz;
1297: }
1299: /* backward solve the upper triangular */
1300: for (i = n - 1; i >= 0; i--) {
1301: v = aa + adiag[i + 1] + 1;
1302: vi = aj + adiag[i + 1] + 1;
1303: nz = adiag[i] - adiag[i + 1] - 1;
1304: sum = tmp[i];
1305: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1306: x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1307: }
1309: PetscCall(ISRestoreIndices(isrow, &rout));
1310: PetscCall(ISRestoreIndices(iscol, &cout));
1311: PetscCall(VecRestoreArrayRead(bb, &b));
1312: PetscCall(VecRestoreArray(xx, &x));
1313: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1314: PetscFunctionReturn(PETSC_SUCCESS);
1315: }