Actual source code: baijsolv.c
petsc-3.13.6 2020-09-29
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
5: {
6: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
7: IS iscol=a->col,isrow=a->row;
8: PetscErrorCode ierr;
9: const PetscInt *r,*c,*rout,*cout;
10: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
11: PetscInt i,nz;
12: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
13: const MatScalar *aa=a->a,*v;
14: PetscScalar *x,*s,*t,*ls;
15: const PetscScalar *b;
18: VecGetArrayRead(bb,&b);
19: VecGetArray(xx,&x);
20: t = a->solve_work;
22: ISGetIndices(isrow,&rout); r = rout;
23: ISGetIndices(iscol,&cout); c = cout + (n-1);
25: /* forward solve the lower triangular */
26: PetscArraycpy(t,b+bs*(*r++),bs);
27: for (i=1; i<n; i++) {
28: v = aa + bs2*ai[i];
29: vi = aj + ai[i];
30: nz = a->diag[i] - ai[i];
31: s = t + bs*i;
32: PetscArraycpy(s,b+bs*(*r++),bs);
33: while (nz--) {
34: PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
35: v += bs2;
36: }
37: }
38: /* backward solve the upper triangular */
39: ls = a->solve_work + A->cmap->n;
40: for (i=n-1; i>=0; i--) {
41: v = aa + bs2*(a->diag[i] + 1);
42: vi = aj + a->diag[i] + 1;
43: nz = ai[i+1] - a->diag[i] - 1;
44: PetscArraycpy(ls,t+i*bs,bs);
45: while (nz--) {
46: PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
47: v += bs2;
48: }
49: PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
50: PetscArraycpy(x + bs*(*c--),t+i*bs,bs);
51: }
53: ISRestoreIndices(isrow,&rout);
54: ISRestoreIndices(iscol,&cout);
55: VecRestoreArrayRead(bb,&b);
56: VecRestoreArray(xx,&x);
57: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
58: return(0);
59: }
61: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
62: {
63: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
64: IS iscol=a->col,isrow=a->row;
65: PetscErrorCode ierr;
66: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
67: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
68: PetscInt i,nz,idx,idt,idc;
69: const MatScalar *aa=a->a,*v;
70: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
71: const PetscScalar *b;
74: VecGetArrayRead(bb,&b);
75: VecGetArray(xx,&x);
76: t = a->solve_work;
78: ISGetIndices(isrow,&rout); r = rout;
79: ISGetIndices(iscol,&cout); c = cout + (n-1);
81: /* forward solve the lower triangular */
82: idx = 7*(*r++);
83: t[0] = b[idx]; t[1] = b[1+idx];
84: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
85: t[5] = b[5+idx]; t[6] = b[6+idx];
87: for (i=1; i<n; i++) {
88: v = aa + 49*ai[i];
89: vi = aj + ai[i];
90: nz = diag[i] - ai[i];
91: idx = 7*(*r++);
92: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
93: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
94: while (nz--) {
95: idx = 7*(*vi++);
96: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
97: x4 = t[3+idx];x5 = t[4+idx];
98: x6 = t[5+idx];x7 = t[6+idx];
99: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
100: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
101: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
102: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
103: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
104: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
105: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
106: v += 49;
107: }
108: idx = 7*i;
109: t[idx] = s1;t[1+idx] = s2;
110: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
111: t[5+idx] = s6;t[6+idx] = s7;
112: }
113: /* backward solve the upper triangular */
114: for (i=n-1; i>=0; i--) {
115: v = aa + 49*diag[i] + 49;
116: vi = aj + diag[i] + 1;
117: nz = ai[i+1] - diag[i] - 1;
118: idt = 7*i;
119: s1 = t[idt]; s2 = t[1+idt];
120: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
121: s6 = t[5+idt];s7 = t[6+idt];
122: while (nz--) {
123: idx = 7*(*vi++);
124: x1 = t[idx]; x2 = t[1+idx];
125: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
126: x6 = t[5+idx]; x7 = t[6+idx];
127: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
128: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
129: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
130: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
131: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
132: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
133: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134: v += 49;
135: }
136: idc = 7*(*c--);
137: v = aa + 49*diag[i];
138: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
139: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
140: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
141: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
142: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
143: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
144: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
145: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
146: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
147: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
148: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
149: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
150: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
151: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
152: }
154: ISRestoreIndices(isrow,&rout);
155: ISRestoreIndices(iscol,&cout);
156: VecRestoreArrayRead(bb,&b);
157: VecRestoreArray(xx,&x);
158: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
159: return(0);
160: }
162: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
163: {
164: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
165: IS iscol=a->col,isrow=a->row;
166: PetscErrorCode ierr;
167: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
168: const PetscInt n=a->mbs,*rout,*cout,*vi;
169: PetscInt i,nz,idx,idt,idc,m;
170: const MatScalar *aa=a->a,*v;
171: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
172: const PetscScalar *b;
175: VecGetArrayRead(bb,&b);
176: VecGetArray(xx,&x);
177: t = a->solve_work;
179: ISGetIndices(isrow,&rout); r = rout;
180: ISGetIndices(iscol,&cout); c = cout;
182: /* forward solve the lower triangular */
183: idx = 7*r[0];
184: t[0] = b[idx]; t[1] = b[1+idx];
185: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
186: t[5] = b[5+idx]; t[6] = b[6+idx];
188: for (i=1; i<n; i++) {
189: v = aa + 49*ai[i];
190: vi = aj + ai[i];
191: nz = ai[i+1] - ai[i];
192: idx = 7*r[i];
193: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
194: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
195: for (m=0; m<nz; m++) {
196: idx = 7*vi[m];
197: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
198: x4 = t[3+idx];x5 = t[4+idx];
199: x6 = t[5+idx];x7 = t[6+idx];
200: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
201: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
202: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
203: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
204: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
205: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
206: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
207: v += 49;
208: }
209: idx = 7*i;
210: t[idx] = s1;t[1+idx] = s2;
211: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
212: t[5+idx] = s6;t[6+idx] = s7;
213: }
214: /* backward solve the upper triangular */
215: for (i=n-1; i>=0; i--) {
216: v = aa + 49*(adiag[i+1]+1);
217: vi = aj + adiag[i+1]+1;
218: nz = adiag[i] - adiag[i+1] - 1;
219: idt = 7*i;
220: s1 = t[idt]; s2 = t[1+idt];
221: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
222: s6 = t[5+idt];s7 = t[6+idt];
223: for (m=0; m<nz; m++) {
224: idx = 7*vi[m];
225: x1 = t[idx]; x2 = t[1+idx];
226: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
227: x6 = t[5+idx]; x7 = t[6+idx];
228: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
229: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
230: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
231: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
232: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
233: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
234: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
235: v += 49;
236: }
237: idc = 7*c[i];
238: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
239: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
240: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
241: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
242: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
243: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
244: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
245: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
246: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
247: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
248: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
249: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
250: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
251: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
252: }
254: ISRestoreIndices(isrow,&rout);
255: ISRestoreIndices(iscol,&cout);
256: VecRestoreArrayRead(bb,&b);
257: VecRestoreArray(xx,&x);
258: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
259: return(0);
260: }
262: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
263: {
264: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
265: IS iscol=a->col,isrow=a->row;
266: PetscErrorCode ierr;
267: const PetscInt *r,*c,*rout,*cout;
268: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
269: PetscInt i,nz,idx,idt,idc;
270: const MatScalar *aa=a->a,*v;
271: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
272: const PetscScalar *b;
275: VecGetArrayRead(bb,&b);
276: VecGetArray(xx,&x);
277: t = a->solve_work;
279: ISGetIndices(isrow,&rout); r = rout;
280: ISGetIndices(iscol,&cout); c = cout + (n-1);
282: /* forward solve the lower triangular */
283: idx = 6*(*r++);
284: t[0] = b[idx]; t[1] = b[1+idx];
285: t[2] = b[2+idx]; t[3] = b[3+idx];
286: t[4] = b[4+idx]; t[5] = b[5+idx];
287: for (i=1; i<n; i++) {
288: v = aa + 36*ai[i];
289: vi = aj + ai[i];
290: nz = diag[i] - ai[i];
291: idx = 6*(*r++);
292: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
293: s5 = b[4+idx]; s6 = b[5+idx];
294: while (nz--) {
295: idx = 6*(*vi++);
296: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
297: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
298: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
299: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
300: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
301: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
302: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
303: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
304: v += 36;
305: }
306: idx = 6*i;
307: t[idx] = s1;t[1+idx] = s2;
308: t[2+idx] = s3;t[3+idx] = s4;
309: t[4+idx] = s5;t[5+idx] = s6;
310: }
311: /* backward solve the upper triangular */
312: for (i=n-1; i>=0; i--) {
313: v = aa + 36*diag[i] + 36;
314: vi = aj + diag[i] + 1;
315: nz = ai[i+1] - diag[i] - 1;
316: idt = 6*i;
317: s1 = t[idt]; s2 = t[1+idt];
318: s3 = t[2+idt];s4 = t[3+idt];
319: s5 = t[4+idt];s6 = t[5+idt];
320: while (nz--) {
321: idx = 6*(*vi++);
322: x1 = t[idx]; x2 = t[1+idx];
323: x3 = t[2+idx]; x4 = t[3+idx];
324: x5 = t[4+idx]; x6 = t[5+idx];
325: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
326: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
327: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
328: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
329: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
330: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
331: v += 36;
332: }
333: idc = 6*(*c--);
334: v = aa + 36*diag[i];
335: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
336: v[18]*s4+v[24]*s5+v[30]*s6;
337: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
338: v[19]*s4+v[25]*s5+v[31]*s6;
339: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
340: v[20]*s4+v[26]*s5+v[32]*s6;
341: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
342: v[21]*s4+v[27]*s5+v[33]*s6;
343: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
344: v[22]*s4+v[28]*s5+v[34]*s6;
345: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
346: v[23]*s4+v[29]*s5+v[35]*s6;
347: }
349: ISRestoreIndices(isrow,&rout);
350: ISRestoreIndices(iscol,&cout);
351: VecRestoreArrayRead(bb,&b);
352: VecRestoreArray(xx,&x);
353: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
354: return(0);
355: }
357: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
358: {
359: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
360: IS iscol=a->col,isrow=a->row;
361: PetscErrorCode ierr;
362: const PetscInt *r,*c,*rout,*cout;
363: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
364: PetscInt i,nz,idx,idt,idc,m;
365: const MatScalar *aa=a->a,*v;
366: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
367: const PetscScalar *b;
370: VecGetArrayRead(bb,&b);
371: VecGetArray(xx,&x);
372: t = a->solve_work;
374: ISGetIndices(isrow,&rout); r = rout;
375: ISGetIndices(iscol,&cout); c = cout;
377: /* forward solve the lower triangular */
378: idx = 6*r[0];
379: t[0] = b[idx]; t[1] = b[1+idx];
380: t[2] = b[2+idx]; t[3] = b[3+idx];
381: t[4] = b[4+idx]; t[5] = b[5+idx];
382: for (i=1; i<n; i++) {
383: v = aa + 36*ai[i];
384: vi = aj + ai[i];
385: nz = ai[i+1] - ai[i];
386: idx = 6*r[i];
387: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
388: s5 = b[4+idx]; s6 = b[5+idx];
389: for (m=0; m<nz; m++) {
390: idx = 6*vi[m];
391: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
392: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
393: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
394: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
395: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
396: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
397: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
398: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
399: v += 36;
400: }
401: idx = 6*i;
402: t[idx] = s1;t[1+idx] = s2;
403: t[2+idx] = s3;t[3+idx] = s4;
404: t[4+idx] = s5;t[5+idx] = s6;
405: }
406: /* backward solve the upper triangular */
407: for (i=n-1; i>=0; i--) {
408: v = aa + 36*(adiag[i+1]+1);
409: vi = aj + adiag[i+1]+1;
410: nz = adiag[i] - adiag[i+1] - 1;
411: idt = 6*i;
412: s1 = t[idt]; s2 = t[1+idt];
413: s3 = t[2+idt];s4 = t[3+idt];
414: s5 = t[4+idt];s6 = t[5+idt];
415: for (m=0; m<nz; m++) {
416: idx = 6*vi[m];
417: x1 = t[idx]; x2 = t[1+idx];
418: x3 = t[2+idx]; x4 = t[3+idx];
419: x5 = t[4+idx]; x6 = t[5+idx];
420: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
421: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
422: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
423: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
424: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
425: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
426: v += 36;
427: }
428: idc = 6*c[i];
429: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
430: v[18]*s4+v[24]*s5+v[30]*s6;
431: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
432: v[19]*s4+v[25]*s5+v[31]*s6;
433: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
434: v[20]*s4+v[26]*s5+v[32]*s6;
435: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
436: v[21]*s4+v[27]*s5+v[33]*s6;
437: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
438: v[22]*s4+v[28]*s5+v[34]*s6;
439: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
440: v[23]*s4+v[29]*s5+v[35]*s6;
441: }
443: ISRestoreIndices(isrow,&rout);
444: ISRestoreIndices(iscol,&cout);
445: VecRestoreArrayRead(bb,&b);
446: VecRestoreArray(xx,&x);
447: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
448: return(0);
449: }
451: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
452: {
453: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
454: IS iscol=a->col,isrow=a->row;
455: PetscErrorCode ierr;
456: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
457: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
458: PetscInt i,nz,idx,idt,idc;
459: const MatScalar *aa=a->a,*v;
460: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
461: const PetscScalar *b;
464: VecGetArrayRead(bb,&b);
465: VecGetArray(xx,&x);
466: t = a->solve_work;
468: ISGetIndices(isrow,&rout); r = rout;
469: ISGetIndices(iscol,&cout); c = cout + (n-1);
471: /* forward solve the lower triangular */
472: idx = 5*(*r++);
473: t[0] = b[idx]; t[1] = b[1+idx];
474: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
475: for (i=1; i<n; i++) {
476: v = aa + 25*ai[i];
477: vi = aj + ai[i];
478: nz = diag[i] - ai[i];
479: idx = 5*(*r++);
480: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
481: s5 = b[4+idx];
482: while (nz--) {
483: idx = 5*(*vi++);
484: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
485: x4 = t[3+idx];x5 = t[4+idx];
486: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
487: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
488: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
489: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
490: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
491: v += 25;
492: }
493: idx = 5*i;
494: t[idx] = s1;t[1+idx] = s2;
495: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
496: }
497: /* backward solve the upper triangular */
498: for (i=n-1; i>=0; i--) {
499: v = aa + 25*diag[i] + 25;
500: vi = aj + diag[i] + 1;
501: nz = ai[i+1] - diag[i] - 1;
502: idt = 5*i;
503: s1 = t[idt]; s2 = t[1+idt];
504: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
505: while (nz--) {
506: idx = 5*(*vi++);
507: x1 = t[idx]; x2 = t[1+idx];
508: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
509: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
510: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
511: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
512: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
513: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
514: v += 25;
515: }
516: idc = 5*(*c--);
517: v = aa + 25*diag[i];
518: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
519: v[15]*s4+v[20]*s5;
520: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
521: v[16]*s4+v[21]*s5;
522: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
523: v[17]*s4+v[22]*s5;
524: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
525: v[18]*s4+v[23]*s5;
526: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
527: v[19]*s4+v[24]*s5;
528: }
530: ISRestoreIndices(isrow,&rout);
531: ISRestoreIndices(iscol,&cout);
532: VecRestoreArrayRead(bb,&b);
533: VecRestoreArray(xx,&x);
534: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
535: return(0);
536: }
538: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
539: {
540: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
541: IS iscol=a->col,isrow=a->row;
542: PetscErrorCode ierr;
543: const PetscInt *r,*c,*rout,*cout;
544: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
545: PetscInt i,nz,idx,idt,idc,m;
546: const MatScalar *aa=a->a,*v;
547: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
548: const PetscScalar *b;
551: VecGetArrayRead(bb,&b);
552: VecGetArray(xx,&x);
553: t = a->solve_work;
555: ISGetIndices(isrow,&rout); r = rout;
556: ISGetIndices(iscol,&cout); c = cout;
558: /* forward solve the lower triangular */
559: idx = 5*r[0];
560: t[0] = b[idx]; t[1] = b[1+idx];
561: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
562: for (i=1; i<n; i++) {
563: v = aa + 25*ai[i];
564: vi = aj + ai[i];
565: nz = ai[i+1] - ai[i];
566: idx = 5*r[i];
567: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
568: s5 = b[4+idx];
569: for (m=0; m<nz; m++) {
570: idx = 5*vi[m];
571: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
572: x4 = t[3+idx];x5 = t[4+idx];
573: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
574: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
575: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
576: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
577: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
578: v += 25;
579: }
580: idx = 5*i;
581: t[idx] = s1;t[1+idx] = s2;
582: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
583: }
584: /* backward solve the upper triangular */
585: for (i=n-1; i>=0; i--) {
586: v = aa + 25*(adiag[i+1]+1);
587: vi = aj + adiag[i+1]+1;
588: nz = adiag[i] - adiag[i+1] - 1;
589: idt = 5*i;
590: s1 = t[idt]; s2 = t[1+idt];
591: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
592: for (m=0; m<nz; m++) {
593: idx = 5*vi[m];
594: x1 = t[idx]; x2 = t[1+idx];
595: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
596: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
597: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
598: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
599: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
600: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
601: v += 25;
602: }
603: idc = 5*c[i];
604: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
605: v[15]*s4+v[20]*s5;
606: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
607: v[16]*s4+v[21]*s5;
608: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
609: v[17]*s4+v[22]*s5;
610: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
611: v[18]*s4+v[23]*s5;
612: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
613: v[19]*s4+v[24]*s5;
614: }
616: ISRestoreIndices(isrow,&rout);
617: ISRestoreIndices(iscol,&cout);
618: VecRestoreArrayRead(bb,&b);
619: VecRestoreArray(xx,&x);
620: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
621: return(0);
622: }
624: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
625: {
626: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
627: IS iscol=a->col,isrow=a->row;
628: PetscErrorCode ierr;
629: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
630: PetscInt i,nz,idx,idt,idc;
631: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
632: const MatScalar *aa=a->a,*v;
633: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
634: const PetscScalar *b;
637: VecGetArrayRead(bb,&b);
638: VecGetArray(xx,&x);
639: t = a->solve_work;
641: ISGetIndices(isrow,&rout); r = rout;
642: ISGetIndices(iscol,&cout); c = cout + (n-1);
644: /* forward solve the lower triangular */
645: idx = 4*(*r++);
646: t[0] = b[idx]; t[1] = b[1+idx];
647: t[2] = b[2+idx]; t[3] = b[3+idx];
648: for (i=1; i<n; i++) {
649: v = aa + 16*ai[i];
650: vi = aj + ai[i];
651: nz = diag[i] - ai[i];
652: idx = 4*(*r++);
653: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
654: while (nz--) {
655: idx = 4*(*vi++);
656: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
657: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
658: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
659: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
660: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
661: v += 16;
662: }
663: idx = 4*i;
664: t[idx] = s1;t[1+idx] = s2;
665: t[2+idx] = s3;t[3+idx] = s4;
666: }
667: /* backward solve the upper triangular */
668: for (i=n-1; i>=0; i--) {
669: v = aa + 16*diag[i] + 16;
670: vi = aj + diag[i] + 1;
671: nz = ai[i+1] - diag[i] - 1;
672: idt = 4*i;
673: s1 = t[idt]; s2 = t[1+idt];
674: s3 = t[2+idt];s4 = t[3+idt];
675: while (nz--) {
676: idx = 4*(*vi++);
677: x1 = t[idx]; x2 = t[1+idx];
678: x3 = t[2+idx]; x4 = t[3+idx];
679: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
680: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
681: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
682: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
683: v += 16;
684: }
685: idc = 4*(*c--);
686: v = aa + 16*diag[i];
687: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
688: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
689: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
690: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
691: }
693: ISRestoreIndices(isrow,&rout);
694: ISRestoreIndices(iscol,&cout);
695: VecRestoreArrayRead(bb,&b);
696: VecRestoreArray(xx,&x);
697: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
698: return(0);
699: }
701: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
702: {
703: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
704: IS iscol=a->col,isrow=a->row;
705: PetscErrorCode ierr;
706: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
707: PetscInt i,nz,idx,idt,idc,m;
708: const PetscInt *r,*c,*rout,*cout;
709: const MatScalar *aa=a->a,*v;
710: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
711: const PetscScalar *b;
714: VecGetArrayRead(bb,&b);
715: VecGetArray(xx,&x);
716: t = a->solve_work;
718: ISGetIndices(isrow,&rout); r = rout;
719: ISGetIndices(iscol,&cout); c = cout;
721: /* forward solve the lower triangular */
722: idx = 4*r[0];
723: t[0] = b[idx]; t[1] = b[1+idx];
724: t[2] = b[2+idx]; t[3] = b[3+idx];
725: for (i=1; i<n; i++) {
726: v = aa + 16*ai[i];
727: vi = aj + ai[i];
728: nz = ai[i+1] - ai[i];
729: idx = 4*r[i];
730: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
731: for (m=0; m<nz; m++) {
732: idx = 4*vi[m];
733: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
734: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
735: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
736: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
737: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
738: v += 16;
739: }
740: idx = 4*i;
741: t[idx] = s1;t[1+idx] = s2;
742: t[2+idx] = s3;t[3+idx] = s4;
743: }
744: /* backward solve the upper triangular */
745: for (i=n-1; i>=0; i--) {
746: v = aa + 16*(adiag[i+1]+1);
747: vi = aj + adiag[i+1]+1;
748: nz = adiag[i] - adiag[i+1] - 1;
749: idt = 4*i;
750: s1 = t[idt]; s2 = t[1+idt];
751: s3 = t[2+idt];s4 = t[3+idt];
752: for (m=0; m<nz; m++) {
753: idx = 4*vi[m];
754: x1 = t[idx]; x2 = t[1+idx];
755: x3 = t[2+idx]; x4 = t[3+idx];
756: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
757: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
758: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
759: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
760: v += 16;
761: }
762: idc = 4*c[i];
763: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
764: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
765: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
766: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
767: }
769: ISRestoreIndices(isrow,&rout);
770: ISRestoreIndices(iscol,&cout);
771: VecRestoreArrayRead(bb,&b);
772: VecRestoreArray(xx,&x);
773: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
774: return(0);
775: }
777: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
778: {
779: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
780: IS iscol=a->col,isrow=a->row;
781: PetscErrorCode ierr;
782: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
783: PetscInt i,nz,idx,idt,idc;
784: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
785: const MatScalar *aa=a->a,*v;
786: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
787: PetscScalar *x;
788: const PetscScalar *b;
791: VecGetArrayRead(bb,&b);
792: VecGetArray(xx,&x);
793: t = (MatScalar*)a->solve_work;
795: ISGetIndices(isrow,&rout); r = rout;
796: ISGetIndices(iscol,&cout); c = cout + (n-1);
798: /* forward solve the lower triangular */
799: idx = 4*(*r++);
800: t[0] = (MatScalar)b[idx];
801: t[1] = (MatScalar)b[1+idx];
802: t[2] = (MatScalar)b[2+idx];
803: t[3] = (MatScalar)b[3+idx];
804: for (i=1; i<n; i++) {
805: v = aa + 16*ai[i];
806: vi = aj + ai[i];
807: nz = diag[i] - ai[i];
808: idx = 4*(*r++);
809: s1 = (MatScalar)b[idx];
810: s2 = (MatScalar)b[1+idx];
811: s3 = (MatScalar)b[2+idx];
812: s4 = (MatScalar)b[3+idx];
813: while (nz--) {
814: idx = 4*(*vi++);
815: x1 = t[idx];
816: x2 = t[1+idx];
817: x3 = t[2+idx];
818: x4 = t[3+idx];
819: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
820: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
821: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
822: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
823: v += 16;
824: }
825: idx = 4*i;
826: t[idx] = s1;
827: t[1+idx] = s2;
828: t[2+idx] = s3;
829: t[3+idx] = s4;
830: }
831: /* backward solve the upper triangular */
832: for (i=n-1; i>=0; i--) {
833: v = aa + 16*diag[i] + 16;
834: vi = aj + diag[i] + 1;
835: nz = ai[i+1] - diag[i] - 1;
836: idt = 4*i;
837: s1 = t[idt];
838: s2 = t[1+idt];
839: s3 = t[2+idt];
840: s4 = t[3+idt];
841: while (nz--) {
842: idx = 4*(*vi++);
843: x1 = t[idx];
844: x2 = t[1+idx];
845: x3 = t[2+idx];
846: x4 = t[3+idx];
847: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
848: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
849: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
850: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
851: v += 16;
852: }
853: idc = 4*(*c--);
854: v = aa + 16*diag[i];
855: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
856: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
857: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
858: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
859: x[idc] = (PetscScalar)t[idt];
860: x[1+idc] = (PetscScalar)t[1+idt];
861: x[2+idc] = (PetscScalar)t[2+idt];
862: x[3+idc] = (PetscScalar)t[3+idt];
863: }
865: ISRestoreIndices(isrow,&rout);
866: ISRestoreIndices(iscol,&cout);
867: VecRestoreArrayRead(bb,&b);
868: VecRestoreArray(xx,&x);
869: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
870: return(0);
871: }
873: #if defined(PETSC_HAVE_SSE)
875: #include PETSC_HAVE_SSE
877: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
878: {
879: /*
880: Note: This code uses demotion of double
881: to float when performing the mixed-mode computation.
882: This may not be numerically reasonable for all applications.
883: */
884: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
885: IS iscol=a->col,isrow=a->row;
887: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
888: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
889: MatScalar *aa=a->a,*v;
890: PetscScalar *x,*b,*t;
892: /* Make space in temp stack for 16 Byte Aligned arrays */
893: float ssealignedspace[11],*tmps,*tmpx;
894: unsigned long offset;
897: SSE_SCOPE_BEGIN;
899: offset = (unsigned long)ssealignedspace % 16;
900: if (offset) offset = (16 - offset)/4;
901: tmps = &ssealignedspace[offset];
902: tmpx = &ssealignedspace[offset+4];
903: PREFETCH_NTA(aa+16*ai[1]);
905: VecGetArray(bb,&b);
906: VecGetArray(xx,&x);
907: t = a->solve_work;
909: ISGetIndices(isrow,&rout); r = rout;
910: ISGetIndices(iscol,&cout); c = cout + (n-1);
912: /* forward solve the lower triangular */
913: idx = 4*(*r++);
914: t[0] = b[idx]; t[1] = b[1+idx];
915: t[2] = b[2+idx]; t[3] = b[3+idx];
916: v = aa + 16*ai[1];
918: for (i=1; i<n; ) {
919: PREFETCH_NTA(&v[8]);
920: vi = aj + ai[i];
921: nz = diag[i] - ai[i];
922: idx = 4*(*r++);
924: /* Demote sum from double to float */
925: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
926: LOAD_PS(tmps,XMM7);
928: while (nz--) {
929: PREFETCH_NTA(&v[16]);
930: idx = 4*(*vi++);
932: /* Demote solution (so far) from double to float */
933: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
935: /* 4x4 Matrix-Vector product with negative accumulation: */
936: SSE_INLINE_BEGIN_2(tmpx,v)
937: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
939: /* First Column */
940: SSE_COPY_PS(XMM0,XMM6)
941: SSE_SHUFFLE(XMM0,XMM0,0x00)
942: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
943: SSE_SUB_PS(XMM7,XMM0)
945: /* Second Column */
946: SSE_COPY_PS(XMM1,XMM6)
947: SSE_SHUFFLE(XMM1,XMM1,0x55)
948: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
949: SSE_SUB_PS(XMM7,XMM1)
951: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
953: /* Third Column */
954: SSE_COPY_PS(XMM2,XMM6)
955: SSE_SHUFFLE(XMM2,XMM2,0xAA)
956: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
957: SSE_SUB_PS(XMM7,XMM2)
959: /* Fourth Column */
960: SSE_COPY_PS(XMM3,XMM6)
961: SSE_SHUFFLE(XMM3,XMM3,0xFF)
962: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
963: SSE_SUB_PS(XMM7,XMM3)
964: SSE_INLINE_END_2
966: v += 16;
967: }
968: idx = 4*i;
969: v = aa + 16*ai[++i];
970: PREFETCH_NTA(v);
971: STORE_PS(tmps,XMM7);
973: /* Promote result from float to double */
974: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
975: }
976: /* backward solve the upper triangular */
977: idt = 4*(n-1);
978: ai16 = 16*diag[n-1];
979: v = aa + ai16 + 16;
980: for (i=n-1; i>=0; ) {
981: PREFETCH_NTA(&v[8]);
982: vi = aj + diag[i] + 1;
983: nz = ai[i+1] - diag[i] - 1;
985: /* Demote accumulator from double to float */
986: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
987: LOAD_PS(tmps,XMM7);
989: while (nz--) {
990: PREFETCH_NTA(&v[16]);
991: idx = 4*(*vi++);
993: /* Demote solution (so far) from double to float */
994: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
996: /* 4x4 Matrix-Vector Product with negative accumulation: */
997: SSE_INLINE_BEGIN_2(tmpx,v)
998: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1000: /* First Column */
1001: SSE_COPY_PS(XMM0,XMM6)
1002: SSE_SHUFFLE(XMM0,XMM0,0x00)
1003: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1004: SSE_SUB_PS(XMM7,XMM0)
1006: /* Second Column */
1007: SSE_COPY_PS(XMM1,XMM6)
1008: SSE_SHUFFLE(XMM1,XMM1,0x55)
1009: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1010: SSE_SUB_PS(XMM7,XMM1)
1012: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1014: /* Third Column */
1015: SSE_COPY_PS(XMM2,XMM6)
1016: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1017: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1018: SSE_SUB_PS(XMM7,XMM2)
1020: /* Fourth Column */
1021: SSE_COPY_PS(XMM3,XMM6)
1022: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1023: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1024: SSE_SUB_PS(XMM7,XMM3)
1025: SSE_INLINE_END_2
1026: v += 16;
1027: }
1028: v = aa + ai16;
1029: ai16 = 16*diag[--i];
1030: PREFETCH_NTA(aa+ai16+16);
1031: /*
1032: Scale the result by the diagonal 4x4 block,
1033: which was inverted as part of the factorization
1034: */
1035: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1036: /* First Column */
1037: SSE_COPY_PS(XMM0,XMM7)
1038: SSE_SHUFFLE(XMM0,XMM0,0x00)
1039: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1041: /* Second Column */
1042: SSE_COPY_PS(XMM1,XMM7)
1043: SSE_SHUFFLE(XMM1,XMM1,0x55)
1044: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1045: SSE_ADD_PS(XMM0,XMM1)
1047: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1049: /* Third Column */
1050: SSE_COPY_PS(XMM2,XMM7)
1051: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1052: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1053: SSE_ADD_PS(XMM0,XMM2)
1055: /* Fourth Column */
1056: SSE_COPY_PS(XMM3,XMM7)
1057: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1058: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1059: SSE_ADD_PS(XMM0,XMM3)
1061: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1062: SSE_INLINE_END_3
1064: /* Promote solution from float to double */
1065: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1067: /* Apply reordering to t and stream into x. */
1068: /* This way, x doesn't pollute the cache. */
1069: /* Be careful with size: 2 doubles = 4 floats! */
1070: idc = 4*(*c--);
1071: SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1072: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1073: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1074: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1075: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1076: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1077: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1078: SSE_INLINE_END_2
1079: v = aa + ai16 + 16;
1080: idt -= 4;
1081: }
1083: ISRestoreIndices(isrow,&rout);
1084: ISRestoreIndices(iscol,&cout);
1085: VecRestoreArray(bb,&b);
1086: VecRestoreArray(xx,&x);
1087: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1088: SSE_SCOPE_END;
1089: return(0);
1090: }
1092: #endif
1094: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1095: {
1096: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1097: IS iscol=a->col,isrow=a->row;
1098: PetscErrorCode ierr;
1099: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1100: PetscInt i,nz,idx,idt,idc;
1101: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1102: const MatScalar *aa=a->a,*v;
1103: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1104: const PetscScalar *b;
1107: VecGetArrayRead(bb,&b);
1108: VecGetArray(xx,&x);
1109: t = a->solve_work;
1111: ISGetIndices(isrow,&rout); r = rout;
1112: ISGetIndices(iscol,&cout); c = cout + (n-1);
1114: /* forward solve the lower triangular */
1115: idx = 3*(*r++);
1116: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1117: for (i=1; i<n; i++) {
1118: v = aa + 9*ai[i];
1119: vi = aj + ai[i];
1120: nz = diag[i] - ai[i];
1121: idx = 3*(*r++);
1122: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1123: while (nz--) {
1124: idx = 3*(*vi++);
1125: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1126: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1127: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1128: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1129: v += 9;
1130: }
1131: idx = 3*i;
1132: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1133: }
1134: /* backward solve the upper triangular */
1135: for (i=n-1; i>=0; i--) {
1136: v = aa + 9*diag[i] + 9;
1137: vi = aj + diag[i] + 1;
1138: nz = ai[i+1] - diag[i] - 1;
1139: idt = 3*i;
1140: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1141: while (nz--) {
1142: idx = 3*(*vi++);
1143: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1144: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1145: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1146: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1147: v += 9;
1148: }
1149: idc = 3*(*c--);
1150: v = aa + 9*diag[i];
1151: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1152: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1153: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1154: }
1155: ISRestoreIndices(isrow,&rout);
1156: ISRestoreIndices(iscol,&cout);
1157: VecRestoreArrayRead(bb,&b);
1158: VecRestoreArray(xx,&x);
1159: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1160: return(0);
1161: }
1163: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1164: {
1165: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1166: IS iscol=a->col,isrow=a->row;
1167: PetscErrorCode ierr;
1168: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1169: PetscInt i,nz,idx,idt,idc,m;
1170: const PetscInt *r,*c,*rout,*cout;
1171: const MatScalar *aa=a->a,*v;
1172: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1173: const PetscScalar *b;
1176: VecGetArrayRead(bb,&b);
1177: VecGetArray(xx,&x);
1178: t = a->solve_work;
1180: ISGetIndices(isrow,&rout); r = rout;
1181: ISGetIndices(iscol,&cout); c = cout;
1183: /* forward solve the lower triangular */
1184: idx = 3*r[0];
1185: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1186: for (i=1; i<n; i++) {
1187: v = aa + 9*ai[i];
1188: vi = aj + ai[i];
1189: nz = ai[i+1] - ai[i];
1190: idx = 3*r[i];
1191: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1192: for (m=0; m<nz; m++) {
1193: idx = 3*vi[m];
1194: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1195: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1196: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1197: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1198: v += 9;
1199: }
1200: idx = 3*i;
1201: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1202: }
1203: /* backward solve the upper triangular */
1204: for (i=n-1; i>=0; i--) {
1205: v = aa + 9*(adiag[i+1]+1);
1206: vi = aj + adiag[i+1]+1;
1207: nz = adiag[i] - adiag[i+1] - 1;
1208: idt = 3*i;
1209: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1210: for (m=0; m<nz; m++) {
1211: idx = 3*vi[m];
1212: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1213: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1214: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1215: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1216: v += 9;
1217: }
1218: idc = 3*c[i];
1219: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1220: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1221: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1222: }
1223: ISRestoreIndices(isrow,&rout);
1224: ISRestoreIndices(iscol,&cout);
1225: VecRestoreArrayRead(bb,&b);
1226: VecRestoreArray(xx,&x);
1227: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1228: return(0);
1229: }
1231: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1232: {
1233: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1234: IS iscol=a->col,isrow=a->row;
1235: PetscErrorCode ierr;
1236: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1237: PetscInt i,nz,idx,idt,idc;
1238: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1239: const MatScalar *aa=a->a,*v;
1240: PetscScalar *x,s1,s2,x1,x2,*t;
1241: const PetscScalar *b;
1244: VecGetArrayRead(bb,&b);
1245: VecGetArray(xx,&x);
1246: t = a->solve_work;
1248: ISGetIndices(isrow,&rout); r = rout;
1249: ISGetIndices(iscol,&cout); c = cout + (n-1);
1251: /* forward solve the lower triangular */
1252: idx = 2*(*r++);
1253: t[0] = b[idx]; t[1] = b[1+idx];
1254: for (i=1; i<n; i++) {
1255: v = aa + 4*ai[i];
1256: vi = aj + ai[i];
1257: nz = diag[i] - ai[i];
1258: idx = 2*(*r++);
1259: s1 = b[idx]; s2 = b[1+idx];
1260: while (nz--) {
1261: idx = 2*(*vi++);
1262: x1 = t[idx]; x2 = t[1+idx];
1263: s1 -= v[0]*x1 + v[2]*x2;
1264: s2 -= v[1]*x1 + v[3]*x2;
1265: v += 4;
1266: }
1267: idx = 2*i;
1268: t[idx] = s1; t[1+idx] = s2;
1269: }
1270: /* backward solve the upper triangular */
1271: for (i=n-1; i>=0; i--) {
1272: v = aa + 4*diag[i] + 4;
1273: vi = aj + diag[i] + 1;
1274: nz = ai[i+1] - diag[i] - 1;
1275: idt = 2*i;
1276: s1 = t[idt]; s2 = t[1+idt];
1277: while (nz--) {
1278: idx = 2*(*vi++);
1279: x1 = t[idx]; x2 = t[1+idx];
1280: s1 -= v[0]*x1 + v[2]*x2;
1281: s2 -= v[1]*x1 + v[3]*x2;
1282: v += 4;
1283: }
1284: idc = 2*(*c--);
1285: v = aa + 4*diag[i];
1286: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1287: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1288: }
1289: ISRestoreIndices(isrow,&rout);
1290: ISRestoreIndices(iscol,&cout);
1291: VecRestoreArrayRead(bb,&b);
1292: VecRestoreArray(xx,&x);
1293: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1294: return(0);
1295: }
1297: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1298: {
1299: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1300: IS iscol=a->col,isrow=a->row;
1301: PetscErrorCode ierr;
1302: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1303: PetscInt i,nz,idx,jdx,idt,idc,m;
1304: const PetscInt *r,*c,*rout,*cout;
1305: const MatScalar *aa=a->a,*v;
1306: PetscScalar *x,s1,s2,x1,x2,*t;
1307: const PetscScalar *b;
1310: VecGetArrayRead(bb,&b);
1311: VecGetArray(xx,&x);
1312: t = a->solve_work;
1314: ISGetIndices(isrow,&rout); r = rout;
1315: ISGetIndices(iscol,&cout); c = cout;
1317: /* forward solve the lower triangular */
1318: idx = 2*r[0];
1319: t[0] = b[idx]; t[1] = b[1+idx];
1320: for (i=1; i<n; i++) {
1321: v = aa + 4*ai[i];
1322: vi = aj + ai[i];
1323: nz = ai[i+1] - ai[i];
1324: idx = 2*r[i];
1325: s1 = b[idx]; s2 = b[1+idx];
1326: for (m=0; m<nz; m++) {
1327: jdx = 2*vi[m];
1328: x1 = t[jdx]; x2 = t[1+jdx];
1329: s1 -= v[0]*x1 + v[2]*x2;
1330: s2 -= v[1]*x1 + v[3]*x2;
1331: v += 4;
1332: }
1333: idx = 2*i;
1334: t[idx] = s1; t[1+idx] = s2;
1335: }
1336: /* backward solve the upper triangular */
1337: for (i=n-1; i>=0; i--) {
1338: v = aa + 4*(adiag[i+1]+1);
1339: vi = aj + adiag[i+1]+1;
1340: nz = adiag[i] - adiag[i+1] - 1;
1341: idt = 2*i;
1342: s1 = t[idt]; s2 = t[1+idt];
1343: for (m=0; m<nz; m++) {
1344: idx = 2*vi[m];
1345: x1 = t[idx]; x2 = t[1+idx];
1346: s1 -= v[0]*x1 + v[2]*x2;
1347: s2 -= v[1]*x1 + v[3]*x2;
1348: v += 4;
1349: }
1350: idc = 2*c[i];
1351: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1352: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1353: }
1354: ISRestoreIndices(isrow,&rout);
1355: ISRestoreIndices(iscol,&cout);
1356: VecRestoreArrayRead(bb,&b);
1357: VecRestoreArray(xx,&x);
1358: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1359: return(0);
1360: }
1362: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1363: {
1364: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1365: IS iscol=a->col,isrow=a->row;
1366: PetscErrorCode ierr;
1367: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1368: PetscInt i,nz;
1369: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1370: const MatScalar *aa=a->a,*v;
1371: PetscScalar *x,s1,*t;
1372: const PetscScalar *b;
1375: if (!n) return(0);
1377: VecGetArrayRead(bb,&b);
1378: VecGetArray(xx,&x);
1379: t = a->solve_work;
1381: ISGetIndices(isrow,&rout); r = rout;
1382: ISGetIndices(iscol,&cout); c = cout + (n-1);
1384: /* forward solve the lower triangular */
1385: t[0] = b[*r++];
1386: for (i=1; i<n; i++) {
1387: v = aa + ai[i];
1388: vi = aj + ai[i];
1389: nz = diag[i] - ai[i];
1390: s1 = b[*r++];
1391: while (nz--) {
1392: s1 -= (*v++)*t[*vi++];
1393: }
1394: t[i] = s1;
1395: }
1396: /* backward solve the upper triangular */
1397: for (i=n-1; i>=0; i--) {
1398: v = aa + diag[i] + 1;
1399: vi = aj + diag[i] + 1;
1400: nz = ai[i+1] - diag[i] - 1;
1401: s1 = t[i];
1402: while (nz--) {
1403: s1 -= (*v++)*t[*vi++];
1404: }
1405: x[*c--] = t[i] = aa[diag[i]]*s1;
1406: }
1408: ISRestoreIndices(isrow,&rout);
1409: ISRestoreIndices(iscol,&cout);
1410: VecRestoreArrayRead(bb,&b);
1411: VecRestoreArray(xx,&x);
1412: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1413: return(0);
1414: }
1416: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1417: {
1418: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1419: IS iscol = a->col,isrow = a->row;
1420: PetscErrorCode ierr;
1421: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1422: const PetscInt *rout,*cout,*r,*c;
1423: PetscScalar *x,*tmp,sum;
1424: const PetscScalar *b;
1425: const MatScalar *aa = a->a,*v;
1428: if (!n) return(0);
1430: VecGetArrayRead(bb,&b);
1431: VecGetArray(xx,&x);
1432: tmp = a->solve_work;
1434: ISGetIndices(isrow,&rout); r = rout;
1435: ISGetIndices(iscol,&cout); c = cout;
1437: /* forward solve the lower triangular */
1438: tmp[0] = b[r[0]];
1439: v = aa;
1440: vi = aj;
1441: for (i=1; i<n; i++) {
1442: nz = ai[i+1] - ai[i];
1443: sum = b[r[i]];
1444: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1445: tmp[i] = sum;
1446: v += nz; vi += nz;
1447: }
1449: /* backward solve the upper triangular */
1450: for (i=n-1; i>=0; i--) {
1451: v = aa + adiag[i+1]+1;
1452: vi = aj + adiag[i+1]+1;
1453: nz = adiag[i]-adiag[i+1]-1;
1454: sum = tmp[i];
1455: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1456: x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1457: }
1459: ISRestoreIndices(isrow,&rout);
1460: ISRestoreIndices(iscol,&cout);
1461: VecRestoreArrayRead(bb,&b);
1462: VecRestoreArray(xx,&x);
1463: PetscLogFlops(2.0*a->nz - A->cmap->n);
1464: return(0);
1465: }