Actual source code: baijsolv.c
petsc-3.7.3 2016-08-01
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
6: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
7: {
8: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
9: IS iscol=a->col,isrow=a->row;
10: PetscErrorCode ierr;
11: const PetscInt *r,*c,*rout,*cout;
12: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
13: PetscInt i,nz;
14: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
15: const MatScalar *aa=a->a,*v;
16: PetscScalar *x,*s,*t,*ls;
17: const PetscScalar *b;
20: VecGetArrayRead(bb,&b);
21: VecGetArray(xx,&x);
22: t = a->solve_work;
24: ISGetIndices(isrow,&rout); r = rout;
25: ISGetIndices(iscol,&cout); c = cout + (n-1);
27: /* forward solve the lower triangular */
28: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
29: for (i=1; i<n; i++) {
30: v = aa + bs2*ai[i];
31: vi = aj + ai[i];
32: nz = a->diag[i] - ai[i];
33: s = t + bs*i;
34: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
35: while (nz--) {
36: PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
37: v += bs2;
38: }
39: }
40: /* backward solve the upper triangular */
41: ls = a->solve_work + A->cmap->n;
42: for (i=n-1; i>=0; i--) {
43: v = aa + bs2*(a->diag[i] + 1);
44: vi = aj + a->diag[i] + 1;
45: nz = ai[i+1] - a->diag[i] - 1;
46: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
47: while (nz--) {
48: PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
49: v += bs2;
50: }
51: PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
52: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
53: }
55: ISRestoreIndices(isrow,&rout);
56: ISRestoreIndices(iscol,&cout);
57: VecRestoreArrayRead(bb,&b);
58: VecRestoreArray(xx,&x);
59: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
60: return(0);
61: }
65: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
66: {
67: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
68: IS iscol=a->col,isrow=a->row;
69: PetscErrorCode ierr;
70: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
71: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
72: PetscInt i,nz,idx,idt,idc;
73: const MatScalar *aa=a->a,*v;
74: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
75: const PetscScalar *b;
78: VecGetArrayRead(bb,&b);
79: VecGetArray(xx,&x);
80: t = a->solve_work;
82: ISGetIndices(isrow,&rout); r = rout;
83: ISGetIndices(iscol,&cout); c = cout + (n-1);
85: /* forward solve the lower triangular */
86: idx = 7*(*r++);
87: t[0] = b[idx]; t[1] = b[1+idx];
88: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
89: t[5] = b[5+idx]; t[6] = b[6+idx];
91: for (i=1; i<n; i++) {
92: v = aa + 49*ai[i];
93: vi = aj + ai[i];
94: nz = diag[i] - ai[i];
95: idx = 7*(*r++);
96: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
97: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
98: while (nz--) {
99: idx = 7*(*vi++);
100: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
101: x4 = t[3+idx];x5 = t[4+idx];
102: x6 = t[5+idx];x7 = t[6+idx];
103: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
104: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
105: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
106: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
107: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
108: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
109: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
110: v += 49;
111: }
112: idx = 7*i;
113: t[idx] = s1;t[1+idx] = s2;
114: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
115: t[5+idx] = s6;t[6+idx] = s7;
116: }
117: /* backward solve the upper triangular */
118: for (i=n-1; i>=0; i--) {
119: v = aa + 49*diag[i] + 49;
120: vi = aj + diag[i] + 1;
121: nz = ai[i+1] - diag[i] - 1;
122: idt = 7*i;
123: s1 = t[idt]; s2 = t[1+idt];
124: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
125: s6 = t[5+idt];s7 = t[6+idt];
126: while (nz--) {
127: idx = 7*(*vi++);
128: x1 = t[idx]; x2 = t[1+idx];
129: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
130: x6 = t[5+idx]; x7 = t[6+idx];
131: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
132: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
133: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
134: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
135: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
136: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
137: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
138: v += 49;
139: }
140: idc = 7*(*c--);
141: v = aa + 49*diag[i];
142: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
143: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
144: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
145: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
146: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
147: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
148: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
149: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
150: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
151: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
152: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
153: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
154: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
155: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
156: }
158: ISRestoreIndices(isrow,&rout);
159: ISRestoreIndices(iscol,&cout);
160: VecRestoreArrayRead(bb,&b);
161: VecRestoreArray(xx,&x);
162: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
163: return(0);
164: }
168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
169: {
170: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
171: IS iscol=a->col,isrow=a->row;
172: PetscErrorCode ierr;
173: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
174: const PetscInt n=a->mbs,*rout,*cout,*vi;
175: PetscInt i,nz,idx,idt,idc,m;
176: const MatScalar *aa=a->a,*v;
177: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
178: const PetscScalar *b;
181: VecGetArrayRead(bb,&b);
182: VecGetArray(xx,&x);
183: t = a->solve_work;
185: ISGetIndices(isrow,&rout); r = rout;
186: ISGetIndices(iscol,&cout); c = cout;
188: /* forward solve the lower triangular */
189: idx = 7*r[0];
190: t[0] = b[idx]; t[1] = b[1+idx];
191: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
192: t[5] = b[5+idx]; t[6] = b[6+idx];
194: for (i=1; i<n; i++) {
195: v = aa + 49*ai[i];
196: vi = aj + ai[i];
197: nz = ai[i+1] - ai[i];
198: idx = 7*r[i];
199: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
200: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
201: for (m=0; m<nz; m++) {
202: idx = 7*vi[m];
203: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
204: x4 = t[3+idx];x5 = t[4+idx];
205: x6 = t[5+idx];x7 = t[6+idx];
206: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
207: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
208: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
209: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
210: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
211: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
212: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
213: v += 49;
214: }
215: idx = 7*i;
216: t[idx] = s1;t[1+idx] = s2;
217: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
218: t[5+idx] = s6;t[6+idx] = s7;
219: }
220: /* backward solve the upper triangular */
221: for (i=n-1; i>=0; i--) {
222: v = aa + 49*(adiag[i+1]+1);
223: vi = aj + adiag[i+1]+1;
224: nz = adiag[i] - adiag[i+1] - 1;
225: idt = 7*i;
226: s1 = t[idt]; s2 = t[1+idt];
227: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
228: s6 = t[5+idt];s7 = t[6+idt];
229: for (m=0; m<nz; m++) {
230: idx = 7*vi[m];
231: x1 = t[idx]; x2 = t[1+idx];
232: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
233: x6 = t[5+idx]; x7 = t[6+idx];
234: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
235: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
236: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
237: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
238: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
239: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
240: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
241: v += 49;
242: }
243: idc = 7*c[i];
244: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
245: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
246: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
247: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
248: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
249: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
250: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
251: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
252: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
253: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
254: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
255: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
256: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
257: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
258: }
260: ISRestoreIndices(isrow,&rout);
261: ISRestoreIndices(iscol,&cout);
262: VecRestoreArrayRead(bb,&b);
263: VecRestoreArray(xx,&x);
264: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
265: return(0);
266: }
270: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
271: {
272: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
273: IS iscol=a->col,isrow=a->row;
274: PetscErrorCode ierr;
275: const PetscInt *r,*c,*rout,*cout;
276: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
277: PetscInt i,nz,idx,idt,idc;
278: const MatScalar *aa=a->a,*v;
279: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
280: const PetscScalar *b;
283: VecGetArrayRead(bb,&b);
284: VecGetArray(xx,&x);
285: t = a->solve_work;
287: ISGetIndices(isrow,&rout); r = rout;
288: ISGetIndices(iscol,&cout); c = cout + (n-1);
290: /* forward solve the lower triangular */
291: idx = 6*(*r++);
292: t[0] = b[idx]; t[1] = b[1+idx];
293: t[2] = b[2+idx]; t[3] = b[3+idx];
294: t[4] = b[4+idx]; t[5] = b[5+idx];
295: for (i=1; i<n; i++) {
296: v = aa + 36*ai[i];
297: vi = aj + ai[i];
298: nz = diag[i] - ai[i];
299: idx = 6*(*r++);
300: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
301: s5 = b[4+idx]; s6 = b[5+idx];
302: while (nz--) {
303: idx = 6*(*vi++);
304: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
305: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
306: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
307: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
308: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
309: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
310: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
311: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
312: v += 36;
313: }
314: idx = 6*i;
315: t[idx] = s1;t[1+idx] = s2;
316: t[2+idx] = s3;t[3+idx] = s4;
317: t[4+idx] = s5;t[5+idx] = s6;
318: }
319: /* backward solve the upper triangular */
320: for (i=n-1; i>=0; i--) {
321: v = aa + 36*diag[i] + 36;
322: vi = aj + diag[i] + 1;
323: nz = ai[i+1] - diag[i] - 1;
324: idt = 6*i;
325: s1 = t[idt]; s2 = t[1+idt];
326: s3 = t[2+idt];s4 = t[3+idt];
327: s5 = t[4+idt];s6 = t[5+idt];
328: while (nz--) {
329: idx = 6*(*vi++);
330: x1 = t[idx]; x2 = t[1+idx];
331: x3 = t[2+idx]; x4 = t[3+idx];
332: x5 = t[4+idx]; x6 = t[5+idx];
333: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
334: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
335: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
336: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
337: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
338: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
339: v += 36;
340: }
341: idc = 6*(*c--);
342: v = aa + 36*diag[i];
343: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
344: v[18]*s4+v[24]*s5+v[30]*s6;
345: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
346: v[19]*s4+v[25]*s5+v[31]*s6;
347: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
348: v[20]*s4+v[26]*s5+v[32]*s6;
349: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
350: v[21]*s4+v[27]*s5+v[33]*s6;
351: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
352: v[22]*s4+v[28]*s5+v[34]*s6;
353: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
354: v[23]*s4+v[29]*s5+v[35]*s6;
355: }
357: ISRestoreIndices(isrow,&rout);
358: ISRestoreIndices(iscol,&cout);
359: VecRestoreArrayRead(bb,&b);
360: VecRestoreArray(xx,&x);
361: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
362: return(0);
363: }
367: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
368: {
369: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
370: IS iscol=a->col,isrow=a->row;
371: PetscErrorCode ierr;
372: const PetscInt *r,*c,*rout,*cout;
373: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
374: PetscInt i,nz,idx,idt,idc,m;
375: const MatScalar *aa=a->a,*v;
376: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
377: const PetscScalar *b;
380: VecGetArrayRead(bb,&b);
381: VecGetArray(xx,&x);
382: t = a->solve_work;
384: ISGetIndices(isrow,&rout); r = rout;
385: ISGetIndices(iscol,&cout); c = cout;
387: /* forward solve the lower triangular */
388: idx = 6*r[0];
389: t[0] = b[idx]; t[1] = b[1+idx];
390: t[2] = b[2+idx]; t[3] = b[3+idx];
391: t[4] = b[4+idx]; t[5] = b[5+idx];
392: for (i=1; i<n; i++) {
393: v = aa + 36*ai[i];
394: vi = aj + ai[i];
395: nz = ai[i+1] - ai[i];
396: idx = 6*r[i];
397: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
398: s5 = b[4+idx]; s6 = b[5+idx];
399: for (m=0; m<nz; m++) {
400: idx = 6*vi[m];
401: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
402: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
403: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
404: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
405: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
406: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
407: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
408: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
409: v += 36;
410: }
411: idx = 6*i;
412: t[idx] = s1;t[1+idx] = s2;
413: t[2+idx] = s3;t[3+idx] = s4;
414: t[4+idx] = s5;t[5+idx] = s6;
415: }
416: /* backward solve the upper triangular */
417: for (i=n-1; i>=0; i--) {
418: v = aa + 36*(adiag[i+1]+1);
419: vi = aj + adiag[i+1]+1;
420: nz = adiag[i] - adiag[i+1] - 1;
421: idt = 6*i;
422: s1 = t[idt]; s2 = t[1+idt];
423: s3 = t[2+idt];s4 = t[3+idt];
424: s5 = t[4+idt];s6 = t[5+idt];
425: for (m=0; m<nz; m++) {
426: idx = 6*vi[m];
427: x1 = t[idx]; x2 = t[1+idx];
428: x3 = t[2+idx]; x4 = t[3+idx];
429: x5 = t[4+idx]; x6 = t[5+idx];
430: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
431: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
432: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
433: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
434: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
435: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
436: v += 36;
437: }
438: idc = 6*c[i];
439: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
440: v[18]*s4+v[24]*s5+v[30]*s6;
441: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
442: v[19]*s4+v[25]*s5+v[31]*s6;
443: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
444: v[20]*s4+v[26]*s5+v[32]*s6;
445: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
446: v[21]*s4+v[27]*s5+v[33]*s6;
447: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
448: v[22]*s4+v[28]*s5+v[34]*s6;
449: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
450: v[23]*s4+v[29]*s5+v[35]*s6;
451: }
453: ISRestoreIndices(isrow,&rout);
454: ISRestoreIndices(iscol,&cout);
455: VecRestoreArrayRead(bb,&b);
456: VecRestoreArray(xx,&x);
457: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
458: return(0);
459: }
463: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
464: {
465: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
466: IS iscol=a->col,isrow=a->row;
467: PetscErrorCode ierr;
468: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
469: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
470: PetscInt i,nz,idx,idt,idc;
471: const MatScalar *aa=a->a,*v;
472: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
473: const PetscScalar *b;
476: VecGetArrayRead(bb,&b);
477: VecGetArray(xx,&x);
478: t = a->solve_work;
480: ISGetIndices(isrow,&rout); r = rout;
481: ISGetIndices(iscol,&cout); c = cout + (n-1);
483: /* forward solve the lower triangular */
484: idx = 5*(*r++);
485: t[0] = b[idx]; t[1] = b[1+idx];
486: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
487: for (i=1; i<n; i++) {
488: v = aa + 25*ai[i];
489: vi = aj + ai[i];
490: nz = diag[i] - ai[i];
491: idx = 5*(*r++);
492: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
493: s5 = b[4+idx];
494: while (nz--) {
495: idx = 5*(*vi++);
496: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
497: x4 = t[3+idx];x5 = t[4+idx];
498: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
499: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
500: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
501: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
502: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
503: v += 25;
504: }
505: idx = 5*i;
506: t[idx] = s1;t[1+idx] = s2;
507: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
508: }
509: /* backward solve the upper triangular */
510: for (i=n-1; i>=0; i--) {
511: v = aa + 25*diag[i] + 25;
512: vi = aj + diag[i] + 1;
513: nz = ai[i+1] - diag[i] - 1;
514: idt = 5*i;
515: s1 = t[idt]; s2 = t[1+idt];
516: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
517: while (nz--) {
518: idx = 5*(*vi++);
519: x1 = t[idx]; x2 = t[1+idx];
520: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
521: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
522: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
523: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
524: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
525: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
526: v += 25;
527: }
528: idc = 5*(*c--);
529: v = aa + 25*diag[i];
530: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
531: v[15]*s4+v[20]*s5;
532: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
533: v[16]*s4+v[21]*s5;
534: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
535: v[17]*s4+v[22]*s5;
536: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
537: v[18]*s4+v[23]*s5;
538: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
539: v[19]*s4+v[24]*s5;
540: }
542: ISRestoreIndices(isrow,&rout);
543: ISRestoreIndices(iscol,&cout);
544: VecRestoreArrayRead(bb,&b);
545: VecRestoreArray(xx,&x);
546: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
547: return(0);
548: }
552: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
553: {
554: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
555: IS iscol=a->col,isrow=a->row;
556: PetscErrorCode ierr;
557: const PetscInt *r,*c,*rout,*cout;
558: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
559: PetscInt i,nz,idx,idt,idc,m;
560: const MatScalar *aa=a->a,*v;
561: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
562: const PetscScalar *b;
565: VecGetArrayRead(bb,&b);
566: VecGetArray(xx,&x);
567: t = a->solve_work;
569: ISGetIndices(isrow,&rout); r = rout;
570: ISGetIndices(iscol,&cout); c = cout;
572: /* forward solve the lower triangular */
573: idx = 5*r[0];
574: t[0] = b[idx]; t[1] = b[1+idx];
575: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
576: for (i=1; i<n; i++) {
577: v = aa + 25*ai[i];
578: vi = aj + ai[i];
579: nz = ai[i+1] - ai[i];
580: idx = 5*r[i];
581: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
582: s5 = b[4+idx];
583: for (m=0; m<nz; m++) {
584: idx = 5*vi[m];
585: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
586: x4 = t[3+idx];x5 = t[4+idx];
587: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
588: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
589: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
590: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
591: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
592: v += 25;
593: }
594: idx = 5*i;
595: t[idx] = s1;t[1+idx] = s2;
596: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
597: }
598: /* backward solve the upper triangular */
599: for (i=n-1; i>=0; i--) {
600: v = aa + 25*(adiag[i+1]+1);
601: vi = aj + adiag[i+1]+1;
602: nz = adiag[i] - adiag[i+1] - 1;
603: idt = 5*i;
604: s1 = t[idt]; s2 = t[1+idt];
605: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
606: for (m=0; m<nz; m++) {
607: idx = 5*vi[m];
608: x1 = t[idx]; x2 = t[1+idx];
609: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
610: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
611: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
612: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
613: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
614: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
615: v += 25;
616: }
617: idc = 5*c[i];
618: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
619: v[15]*s4+v[20]*s5;
620: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
621: v[16]*s4+v[21]*s5;
622: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
623: v[17]*s4+v[22]*s5;
624: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
625: v[18]*s4+v[23]*s5;
626: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
627: v[19]*s4+v[24]*s5;
628: }
630: ISRestoreIndices(isrow,&rout);
631: ISRestoreIndices(iscol,&cout);
632: VecRestoreArrayRead(bb,&b);
633: VecRestoreArray(xx,&x);
634: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
635: return(0);
636: }
640: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
641: {
642: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
643: IS iscol=a->col,isrow=a->row;
644: PetscErrorCode ierr;
645: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646: PetscInt i,nz,idx,idt,idc;
647: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
648: const MatScalar *aa=a->a,*v;
649: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
650: const PetscScalar *b;
653: VecGetArrayRead(bb,&b);
654: VecGetArray(xx,&x);
655: t = a->solve_work;
657: ISGetIndices(isrow,&rout); r = rout;
658: ISGetIndices(iscol,&cout); c = cout + (n-1);
660: /* forward solve the lower triangular */
661: idx = 4*(*r++);
662: t[0] = b[idx]; t[1] = b[1+idx];
663: t[2] = b[2+idx]; t[3] = b[3+idx];
664: for (i=1; i<n; i++) {
665: v = aa + 16*ai[i];
666: vi = aj + ai[i];
667: nz = diag[i] - ai[i];
668: idx = 4*(*r++);
669: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
670: while (nz--) {
671: idx = 4*(*vi++);
672: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
673: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
674: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
675: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
676: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
677: v += 16;
678: }
679: idx = 4*i;
680: t[idx] = s1;t[1+idx] = s2;
681: t[2+idx] = s3;t[3+idx] = s4;
682: }
683: /* backward solve the upper triangular */
684: for (i=n-1; i>=0; i--) {
685: v = aa + 16*diag[i] + 16;
686: vi = aj + diag[i] + 1;
687: nz = ai[i+1] - diag[i] - 1;
688: idt = 4*i;
689: s1 = t[idt]; s2 = t[1+idt];
690: s3 = t[2+idt];s4 = t[3+idt];
691: while (nz--) {
692: idx = 4*(*vi++);
693: x1 = t[idx]; x2 = t[1+idx];
694: x3 = t[2+idx]; x4 = t[3+idx];
695: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
696: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
697: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
698: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
699: v += 16;
700: }
701: idc = 4*(*c--);
702: v = aa + 16*diag[i];
703: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
704: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
705: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
706: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
707: }
709: ISRestoreIndices(isrow,&rout);
710: ISRestoreIndices(iscol,&cout);
711: VecRestoreArrayRead(bb,&b);
712: VecRestoreArray(xx,&x);
713: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
714: return(0);
715: }
719: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
720: {
721: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
722: IS iscol=a->col,isrow=a->row;
723: PetscErrorCode ierr;
724: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
725: PetscInt i,nz,idx,idt,idc,m;
726: const PetscInt *r,*c,*rout,*cout;
727: const MatScalar *aa=a->a,*v;
728: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
729: const PetscScalar *b;
732: VecGetArrayRead(bb,&b);
733: VecGetArray(xx,&x);
734: t = a->solve_work;
736: ISGetIndices(isrow,&rout); r = rout;
737: ISGetIndices(iscol,&cout); c = cout;
739: /* forward solve the lower triangular */
740: idx = 4*r[0];
741: t[0] = b[idx]; t[1] = b[1+idx];
742: t[2] = b[2+idx]; t[3] = b[3+idx];
743: for (i=1; i<n; i++) {
744: v = aa + 16*ai[i];
745: vi = aj + ai[i];
746: nz = ai[i+1] - ai[i];
747: idx = 4*r[i];
748: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
749: for (m=0; m<nz; m++) {
750: idx = 4*vi[m];
751: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
752: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
753: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
754: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
755: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
756: v += 16;
757: }
758: idx = 4*i;
759: t[idx] = s1;t[1+idx] = s2;
760: t[2+idx] = s3;t[3+idx] = s4;
761: }
762: /* backward solve the upper triangular */
763: for (i=n-1; i>=0; i--) {
764: v = aa + 16*(adiag[i+1]+1);
765: vi = aj + adiag[i+1]+1;
766: nz = adiag[i] - adiag[i+1] - 1;
767: idt = 4*i;
768: s1 = t[idt]; s2 = t[1+idt];
769: s3 = t[2+idt];s4 = t[3+idt];
770: for (m=0; m<nz; m++) {
771: idx = 4*vi[m];
772: x1 = t[idx]; x2 = t[1+idx];
773: x3 = t[2+idx]; x4 = t[3+idx];
774: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
775: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
776: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
777: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
778: v += 16;
779: }
780: idc = 4*c[i];
781: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
782: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
783: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
784: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
785: }
787: ISRestoreIndices(isrow,&rout);
788: ISRestoreIndices(iscol,&cout);
789: VecRestoreArrayRead(bb,&b);
790: VecRestoreArray(xx,&x);
791: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
792: return(0);
793: }
797: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
798: {
799: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
800: IS iscol=a->col,isrow=a->row;
801: PetscErrorCode ierr;
802: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
803: PetscInt i,nz,idx,idt,idc;
804: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
805: const MatScalar *aa=a->a,*v;
806: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
807: PetscScalar *x;
808: const PetscScalar *b;
811: VecGetArrayRead(bb,&b);
812: VecGetArray(xx,&x);
813: t = (MatScalar*)a->solve_work;
815: ISGetIndices(isrow,&rout); r = rout;
816: ISGetIndices(iscol,&cout); c = cout + (n-1);
818: /* forward solve the lower triangular */
819: idx = 4*(*r++);
820: t[0] = (MatScalar)b[idx];
821: t[1] = (MatScalar)b[1+idx];
822: t[2] = (MatScalar)b[2+idx];
823: t[3] = (MatScalar)b[3+idx];
824: for (i=1; i<n; i++) {
825: v = aa + 16*ai[i];
826: vi = aj + ai[i];
827: nz = diag[i] - ai[i];
828: idx = 4*(*r++);
829: s1 = (MatScalar)b[idx];
830: s2 = (MatScalar)b[1+idx];
831: s3 = (MatScalar)b[2+idx];
832: s4 = (MatScalar)b[3+idx];
833: while (nz--) {
834: idx = 4*(*vi++);
835: x1 = t[idx];
836: x2 = t[1+idx];
837: x3 = t[2+idx];
838: x4 = t[3+idx];
839: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
840: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
841: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
842: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
843: v += 16;
844: }
845: idx = 4*i;
846: t[idx] = s1;
847: t[1+idx] = s2;
848: t[2+idx] = s3;
849: t[3+idx] = s4;
850: }
851: /* backward solve the upper triangular */
852: for (i=n-1; i>=0; i--) {
853: v = aa + 16*diag[i] + 16;
854: vi = aj + diag[i] + 1;
855: nz = ai[i+1] - diag[i] - 1;
856: idt = 4*i;
857: s1 = t[idt];
858: s2 = t[1+idt];
859: s3 = t[2+idt];
860: s4 = t[3+idt];
861: while (nz--) {
862: idx = 4*(*vi++);
863: x1 = t[idx];
864: x2 = t[1+idx];
865: x3 = t[2+idx];
866: x4 = t[3+idx];
867: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
868: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
869: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
870: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
871: v += 16;
872: }
873: idc = 4*(*c--);
874: v = aa + 16*diag[i];
875: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
876: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
877: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
878: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
879: x[idc] = (PetscScalar)t[idt];
880: x[1+idc] = (PetscScalar)t[1+idt];
881: x[2+idc] = (PetscScalar)t[2+idt];
882: x[3+idc] = (PetscScalar)t[3+idt];
883: }
885: ISRestoreIndices(isrow,&rout);
886: ISRestoreIndices(iscol,&cout);
887: VecRestoreArrayRead(bb,&b);
888: VecRestoreArray(xx,&x);
889: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
890: return(0);
891: }
893: #if defined(PETSC_HAVE_SSE)
895: #include PETSC_HAVE_SSE
899: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
900: {
901: /*
902: Note: This code uses demotion of double
903: to float when performing the mixed-mode computation.
904: This may not be numerically reasonable for all applications.
905: */
906: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
907: IS iscol=a->col,isrow=a->row;
909: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
910: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
911: MatScalar *aa=a->a,*v;
912: PetscScalar *x,*b,*t;
914: /* Make space in temp stack for 16 Byte Aligned arrays */
915: float ssealignedspace[11],*tmps,*tmpx;
916: unsigned long offset;
919: SSE_SCOPE_BEGIN;
921: offset = (unsigned long)ssealignedspace % 16;
922: if (offset) offset = (16 - offset)/4;
923: tmps = &ssealignedspace[offset];
924: tmpx = &ssealignedspace[offset+4];
925: PREFETCH_NTA(aa+16*ai[1]);
927: VecGetArray(bb,&b);
928: VecGetArray(xx,&x);
929: t = a->solve_work;
931: ISGetIndices(isrow,&rout); r = rout;
932: ISGetIndices(iscol,&cout); c = cout + (n-1);
934: /* forward solve the lower triangular */
935: idx = 4*(*r++);
936: t[0] = b[idx]; t[1] = b[1+idx];
937: t[2] = b[2+idx]; t[3] = b[3+idx];
938: v = aa + 16*ai[1];
940: for (i=1; i<n; ) {
941: PREFETCH_NTA(&v[8]);
942: vi = aj + ai[i];
943: nz = diag[i] - ai[i];
944: idx = 4*(*r++);
946: /* Demote sum from double to float */
947: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
948: LOAD_PS(tmps,XMM7);
950: while (nz--) {
951: PREFETCH_NTA(&v[16]);
952: idx = 4*(*vi++);
954: /* Demote solution (so far) from double to float */
955: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
957: /* 4x4 Matrix-Vector product with negative accumulation: */
958: SSE_INLINE_BEGIN_2(tmpx,v)
959: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
961: /* First Column */
962: SSE_COPY_PS(XMM0,XMM6)
963: SSE_SHUFFLE(XMM0,XMM0,0x00)
964: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
965: SSE_SUB_PS(XMM7,XMM0)
967: /* Second Column */
968: SSE_COPY_PS(XMM1,XMM6)
969: SSE_SHUFFLE(XMM1,XMM1,0x55)
970: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
971: SSE_SUB_PS(XMM7,XMM1)
973: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
975: /* Third Column */
976: SSE_COPY_PS(XMM2,XMM6)
977: SSE_SHUFFLE(XMM2,XMM2,0xAA)
978: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
979: SSE_SUB_PS(XMM7,XMM2)
981: /* Fourth Column */
982: SSE_COPY_PS(XMM3,XMM6)
983: SSE_SHUFFLE(XMM3,XMM3,0xFF)
984: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
985: SSE_SUB_PS(XMM7,XMM3)
986: SSE_INLINE_END_2
988: v += 16;
989: }
990: idx = 4*i;
991: v = aa + 16*ai[++i];
992: PREFETCH_NTA(v);
993: STORE_PS(tmps,XMM7);
995: /* Promote result from float to double */
996: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
997: }
998: /* backward solve the upper triangular */
999: idt = 4*(n-1);
1000: ai16 = 16*diag[n-1];
1001: v = aa + ai16 + 16;
1002: for (i=n-1; i>=0; ) {
1003: PREFETCH_NTA(&v[8]);
1004: vi = aj + diag[i] + 1;
1005: nz = ai[i+1] - diag[i] - 1;
1007: /* Demote accumulator from double to float */
1008: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1009: LOAD_PS(tmps,XMM7);
1011: while (nz--) {
1012: PREFETCH_NTA(&v[16]);
1013: idx = 4*(*vi++);
1015: /* Demote solution (so far) from double to float */
1016: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1018: /* 4x4 Matrix-Vector Product with negative accumulation: */
1019: SSE_INLINE_BEGIN_2(tmpx,v)
1020: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1022: /* First Column */
1023: SSE_COPY_PS(XMM0,XMM6)
1024: SSE_SHUFFLE(XMM0,XMM0,0x00)
1025: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1026: SSE_SUB_PS(XMM7,XMM0)
1028: /* Second Column */
1029: SSE_COPY_PS(XMM1,XMM6)
1030: SSE_SHUFFLE(XMM1,XMM1,0x55)
1031: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1032: SSE_SUB_PS(XMM7,XMM1)
1034: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1036: /* Third Column */
1037: SSE_COPY_PS(XMM2,XMM6)
1038: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1039: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1040: SSE_SUB_PS(XMM7,XMM2)
1042: /* Fourth Column */
1043: SSE_COPY_PS(XMM3,XMM6)
1044: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1045: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1046: SSE_SUB_PS(XMM7,XMM3)
1047: SSE_INLINE_END_2
1048: v += 16;
1049: }
1050: v = aa + ai16;
1051: ai16 = 16*diag[--i];
1052: PREFETCH_NTA(aa+ai16+16);
1053: /*
1054: Scale the result by the diagonal 4x4 block,
1055: which was inverted as part of the factorization
1056: */
1057: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1058: /* First Column */
1059: SSE_COPY_PS(XMM0,XMM7)
1060: SSE_SHUFFLE(XMM0,XMM0,0x00)
1061: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1063: /* Second Column */
1064: SSE_COPY_PS(XMM1,XMM7)
1065: SSE_SHUFFLE(XMM1,XMM1,0x55)
1066: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1067: SSE_ADD_PS(XMM0,XMM1)
1069: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1071: /* Third Column */
1072: SSE_COPY_PS(XMM2,XMM7)
1073: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1074: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1075: SSE_ADD_PS(XMM0,XMM2)
1077: /* Fourth Column */
1078: SSE_COPY_PS(XMM3,XMM7)
1079: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1080: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1081: SSE_ADD_PS(XMM0,XMM3)
1083: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1084: SSE_INLINE_END_3
1086: /* Promote solution from float to double */
1087: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1089: /* Apply reordering to t and stream into x. */
1090: /* This way, x doesn't pollute the cache. */
1091: /* Be careful with size: 2 doubles = 4 floats! */
1092: idc = 4*(*c--);
1093: SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1094: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1095: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1097: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1098: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1100: SSE_INLINE_END_2
1101: v = aa + ai16 + 16;
1102: idt -= 4;
1103: }
1105: ISRestoreIndices(isrow,&rout);
1106: ISRestoreIndices(iscol,&cout);
1107: VecRestoreArray(bb,&b);
1108: VecRestoreArray(xx,&x);
1109: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1110: SSE_SCOPE_END;
1111: return(0);
1112: }
1114: #endif
1118: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1119: {
1120: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1121: IS iscol=a->col,isrow=a->row;
1122: PetscErrorCode ierr;
1123: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1124: PetscInt i,nz,idx,idt,idc;
1125: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1126: const MatScalar *aa=a->a,*v;
1127: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1128: const PetscScalar *b;
1131: VecGetArrayRead(bb,&b);
1132: VecGetArray(xx,&x);
1133: t = a->solve_work;
1135: ISGetIndices(isrow,&rout); r = rout;
1136: ISGetIndices(iscol,&cout); c = cout + (n-1);
1138: /* forward solve the lower triangular */
1139: idx = 3*(*r++);
1140: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1141: for (i=1; i<n; i++) {
1142: v = aa + 9*ai[i];
1143: vi = aj + ai[i];
1144: nz = diag[i] - ai[i];
1145: idx = 3*(*r++);
1146: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1147: while (nz--) {
1148: idx = 3*(*vi++);
1149: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1150: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1151: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1152: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1153: v += 9;
1154: }
1155: idx = 3*i;
1156: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1157: }
1158: /* backward solve the upper triangular */
1159: for (i=n-1; i>=0; i--) {
1160: v = aa + 9*diag[i] + 9;
1161: vi = aj + diag[i] + 1;
1162: nz = ai[i+1] - diag[i] - 1;
1163: idt = 3*i;
1164: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1165: while (nz--) {
1166: idx = 3*(*vi++);
1167: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1168: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1169: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1170: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1171: v += 9;
1172: }
1173: idc = 3*(*c--);
1174: v = aa + 9*diag[i];
1175: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1176: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1177: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1178: }
1179: ISRestoreIndices(isrow,&rout);
1180: ISRestoreIndices(iscol,&cout);
1181: VecRestoreArrayRead(bb,&b);
1182: VecRestoreArray(xx,&x);
1183: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1184: return(0);
1185: }
1189: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1190: {
1191: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1192: IS iscol=a->col,isrow=a->row;
1193: PetscErrorCode ierr;
1194: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1195: PetscInt i,nz,idx,idt,idc,m;
1196: const PetscInt *r,*c,*rout,*cout;
1197: const MatScalar *aa=a->a,*v;
1198: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1199: const PetscScalar *b;
1202: VecGetArrayRead(bb,&b);
1203: VecGetArray(xx,&x);
1204: t = a->solve_work;
1206: ISGetIndices(isrow,&rout); r = rout;
1207: ISGetIndices(iscol,&cout); c = cout;
1209: /* forward solve the lower triangular */
1210: idx = 3*r[0];
1211: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1212: for (i=1; i<n; i++) {
1213: v = aa + 9*ai[i];
1214: vi = aj + ai[i];
1215: nz = ai[i+1] - ai[i];
1216: idx = 3*r[i];
1217: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1218: for (m=0; m<nz; m++) {
1219: idx = 3*vi[m];
1220: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1221: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1222: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1223: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1224: v += 9;
1225: }
1226: idx = 3*i;
1227: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1228: }
1229: /* backward solve the upper triangular */
1230: for (i=n-1; i>=0; i--) {
1231: v = aa + 9*(adiag[i+1]+1);
1232: vi = aj + adiag[i+1]+1;
1233: nz = adiag[i] - adiag[i+1] - 1;
1234: idt = 3*i;
1235: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1236: for (m=0; m<nz; m++) {
1237: idx = 3*vi[m];
1238: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1239: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1240: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1241: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1242: v += 9;
1243: }
1244: idc = 3*c[i];
1245: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1246: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1247: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1248: }
1249: ISRestoreIndices(isrow,&rout);
1250: ISRestoreIndices(iscol,&cout);
1251: VecRestoreArrayRead(bb,&b);
1252: VecRestoreArray(xx,&x);
1253: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1254: return(0);
1255: }
1259: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1260: {
1261: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1262: IS iscol=a->col,isrow=a->row;
1263: PetscErrorCode ierr;
1264: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1265: PetscInt i,nz,idx,idt,idc;
1266: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1267: const MatScalar *aa=a->a,*v;
1268: PetscScalar *x,s1,s2,x1,x2,*t;
1269: const PetscScalar *b;
1272: VecGetArrayRead(bb,&b);
1273: VecGetArray(xx,&x);
1274: t = a->solve_work;
1276: ISGetIndices(isrow,&rout); r = rout;
1277: ISGetIndices(iscol,&cout); c = cout + (n-1);
1279: /* forward solve the lower triangular */
1280: idx = 2*(*r++);
1281: t[0] = b[idx]; t[1] = b[1+idx];
1282: for (i=1; i<n; i++) {
1283: v = aa + 4*ai[i];
1284: vi = aj + ai[i];
1285: nz = diag[i] - ai[i];
1286: idx = 2*(*r++);
1287: s1 = b[idx]; s2 = b[1+idx];
1288: while (nz--) {
1289: idx = 2*(*vi++);
1290: x1 = t[idx]; x2 = t[1+idx];
1291: s1 -= v[0]*x1 + v[2]*x2;
1292: s2 -= v[1]*x1 + v[3]*x2;
1293: v += 4;
1294: }
1295: idx = 2*i;
1296: t[idx] = s1; t[1+idx] = s2;
1297: }
1298: /* backward solve the upper triangular */
1299: for (i=n-1; i>=0; i--) {
1300: v = aa + 4*diag[i] + 4;
1301: vi = aj + diag[i] + 1;
1302: nz = ai[i+1] - diag[i] - 1;
1303: idt = 2*i;
1304: s1 = t[idt]; s2 = t[1+idt];
1305: while (nz--) {
1306: idx = 2*(*vi++);
1307: x1 = t[idx]; x2 = t[1+idx];
1308: s1 -= v[0]*x1 + v[2]*x2;
1309: s2 -= v[1]*x1 + v[3]*x2;
1310: v += 4;
1311: }
1312: idc = 2*(*c--);
1313: v = aa + 4*diag[i];
1314: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1315: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1316: }
1317: ISRestoreIndices(isrow,&rout);
1318: ISRestoreIndices(iscol,&cout);
1319: VecRestoreArrayRead(bb,&b);
1320: VecRestoreArray(xx,&x);
1321: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1322: return(0);
1323: }
1327: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1328: {
1329: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1330: IS iscol=a->col,isrow=a->row;
1331: PetscErrorCode ierr;
1332: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1333: PetscInt i,nz,idx,jdx,idt,idc,m;
1334: const PetscInt *r,*c,*rout,*cout;
1335: const MatScalar *aa=a->a,*v;
1336: PetscScalar *x,s1,s2,x1,x2,*t;
1337: const PetscScalar *b;
1340: VecGetArrayRead(bb,&b);
1341: VecGetArray(xx,&x);
1342: t = a->solve_work;
1344: ISGetIndices(isrow,&rout); r = rout;
1345: ISGetIndices(iscol,&cout); c = cout;
1347: /* forward solve the lower triangular */
1348: idx = 2*r[0];
1349: t[0] = b[idx]; t[1] = b[1+idx];
1350: for (i=1; i<n; i++) {
1351: v = aa + 4*ai[i];
1352: vi = aj + ai[i];
1353: nz = ai[i+1] - ai[i];
1354: idx = 2*r[i];
1355: s1 = b[idx]; s2 = b[1+idx];
1356: for (m=0; m<nz; m++) {
1357: jdx = 2*vi[m];
1358: x1 = t[jdx]; x2 = t[1+jdx];
1359: s1 -= v[0]*x1 + v[2]*x2;
1360: s2 -= v[1]*x1 + v[3]*x2;
1361: v += 4;
1362: }
1363: idx = 2*i;
1364: t[idx] = s1; t[1+idx] = s2;
1365: }
1366: /* backward solve the upper triangular */
1367: for (i=n-1; i>=0; i--) {
1368: v = aa + 4*(adiag[i+1]+1);
1369: vi = aj + adiag[i+1]+1;
1370: nz = adiag[i] - adiag[i+1] - 1;
1371: idt = 2*i;
1372: s1 = t[idt]; s2 = t[1+idt];
1373: for (m=0; m<nz; m++) {
1374: idx = 2*vi[m];
1375: x1 = t[idx]; x2 = t[1+idx];
1376: s1 -= v[0]*x1 + v[2]*x2;
1377: s2 -= v[1]*x1 + v[3]*x2;
1378: v += 4;
1379: }
1380: idc = 2*c[i];
1381: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1382: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1383: }
1384: ISRestoreIndices(isrow,&rout);
1385: ISRestoreIndices(iscol,&cout);
1386: VecRestoreArrayRead(bb,&b);
1387: VecRestoreArray(xx,&x);
1388: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1389: return(0);
1390: }
1394: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1395: {
1396: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1397: IS iscol=a->col,isrow=a->row;
1398: PetscErrorCode ierr;
1399: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1400: PetscInt i,nz;
1401: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1402: const MatScalar *aa=a->a,*v;
1403: PetscScalar *x,s1,*t;
1404: const PetscScalar *b;
1407: if (!n) return(0);
1409: VecGetArrayRead(bb,&b);
1410: VecGetArray(xx,&x);
1411: t = a->solve_work;
1413: ISGetIndices(isrow,&rout); r = rout;
1414: ISGetIndices(iscol,&cout); c = cout + (n-1);
1416: /* forward solve the lower triangular */
1417: t[0] = b[*r++];
1418: for (i=1; i<n; i++) {
1419: v = aa + ai[i];
1420: vi = aj + ai[i];
1421: nz = diag[i] - ai[i];
1422: s1 = b[*r++];
1423: while (nz--) {
1424: s1 -= (*v++)*t[*vi++];
1425: }
1426: t[i] = s1;
1427: }
1428: /* backward solve the upper triangular */
1429: for (i=n-1; i>=0; i--) {
1430: v = aa + diag[i] + 1;
1431: vi = aj + diag[i] + 1;
1432: nz = ai[i+1] - diag[i] - 1;
1433: s1 = t[i];
1434: while (nz--) {
1435: s1 -= (*v++)*t[*vi++];
1436: }
1437: x[*c--] = t[i] = aa[diag[i]]*s1;
1438: }
1440: ISRestoreIndices(isrow,&rout);
1441: ISRestoreIndices(iscol,&cout);
1442: VecRestoreArrayRead(bb,&b);
1443: VecRestoreArray(xx,&x);
1444: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1445: return(0);
1446: }
1450: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1451: {
1452: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1453: IS iscol = a->col,isrow = a->row;
1454: PetscErrorCode ierr;
1455: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1456: const PetscInt *rout,*cout,*r,*c;
1457: PetscScalar *x,*tmp,sum;
1458: const PetscScalar *b;
1459: const MatScalar *aa = a->a,*v;
1462: if (!n) return(0);
1464: VecGetArrayRead(bb,&b);
1465: VecGetArray(xx,&x);
1466: tmp = a->solve_work;
1468: ISGetIndices(isrow,&rout); r = rout;
1469: ISGetIndices(iscol,&cout); c = cout;
1471: /* forward solve the lower triangular */
1472: tmp[0] = b[r[0]];
1473: v = aa;
1474: vi = aj;
1475: for (i=1; i<n; i++) {
1476: nz = ai[i+1] - ai[i];
1477: sum = b[r[i]];
1478: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1479: tmp[i] = sum;
1480: v += nz; vi += nz;
1481: }
1483: /* backward solve the upper triangular */
1484: for (i=n-1; i>=0; i--) {
1485: v = aa + adiag[i+1]+1;
1486: vi = aj + adiag[i+1]+1;
1487: nz = adiag[i]-adiag[i+1]-1;
1488: sum = tmp[i];
1489: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1490: x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1491: }
1493: ISRestoreIndices(isrow,&rout);
1494: ISRestoreIndices(iscol,&cout);
1495: VecRestoreArrayRead(bb,&b);
1496: VecRestoreArray(xx,&x);
1497: PetscLogFlops(2*a->nz - A->cmap->n);
1498: return(0);
1499: }