Actual source code: baijsolv.c

petsc-3.7.3 2016-08-01
Report Typos and Errors
  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  6: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
  7: {
  8:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
  9:   IS                iscol=a->col,isrow=a->row;
 10:   PetscErrorCode    ierr;
 11:   const PetscInt    *r,*c,*rout,*cout;
 12:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
 13:   PetscInt          i,nz;
 14:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
 15:   const MatScalar   *aa=a->a,*v;
 16:   PetscScalar       *x,*s,*t,*ls;
 17:   const PetscScalar *b;

 20:   VecGetArrayRead(bb,&b);
 21:   VecGetArray(xx,&x);
 22:   t    = a->solve_work;

 24:   ISGetIndices(isrow,&rout); r = rout;
 25:   ISGetIndices(iscol,&cout); c = cout + (n-1);

 27:   /* forward solve the lower triangular */
 28:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
 29:   for (i=1; i<n; i++) {
 30:     v    = aa + bs2*ai[i];
 31:     vi   = aj + ai[i];
 32:     nz   = a->diag[i] - ai[i];
 33:     s    = t + bs*i;
 34:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
 35:     while (nz--) {
 36:       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
 37:       v += bs2;
 38:     }
 39:   }
 40:   /* backward solve the upper triangular */
 41:   ls = a->solve_work + A->cmap->n;
 42:   for (i=n-1; i>=0; i--) {
 43:     v    = aa + bs2*(a->diag[i] + 1);
 44:     vi   = aj + a->diag[i] + 1;
 45:     nz   = ai[i+1] - a->diag[i] - 1;
 46:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
 47:     while (nz--) {
 48:       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
 49:       v += bs2;
 50:     }
 51:     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
 52:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
 53:   }

 55:   ISRestoreIndices(isrow,&rout);
 56:   ISRestoreIndices(iscol,&cout);
 57:   VecRestoreArrayRead(bb,&b);
 58:   VecRestoreArray(xx,&x);
 59:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
 60:   return(0);
 61: }

 65: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
 66: {
 67:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
 68:   IS                iscol=a->col,isrow=a->row;
 69:   PetscErrorCode    ierr;
 70:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
 71:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
 72:   PetscInt          i,nz,idx,idt,idc;
 73:   const MatScalar   *aa=a->a,*v;
 74:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
 75:   const PetscScalar *b;

 78:   VecGetArrayRead(bb,&b);
 79:   VecGetArray(xx,&x);
 80:   t    = a->solve_work;

 82:   ISGetIndices(isrow,&rout); r = rout;
 83:   ISGetIndices(iscol,&cout); c = cout + (n-1);

 85:   /* forward solve the lower triangular */
 86:   idx  = 7*(*r++);
 87:   t[0] = b[idx];   t[1] = b[1+idx];
 88:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
 89:   t[5] = b[5+idx]; t[6] = b[6+idx];

 91:   for (i=1; i<n; i++) {
 92:     v   = aa + 49*ai[i];
 93:     vi  = aj + ai[i];
 94:     nz  = diag[i] - ai[i];
 95:     idx = 7*(*r++);
 96:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
 97:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
 98:     while (nz--) {
 99:       idx = 7*(*vi++);
100:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
101:       x4  = t[3+idx];x5 = t[4+idx];
102:       x6  = t[5+idx];x7 = t[6+idx];
103:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
104:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
105:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
106:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
107:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
108:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
109:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
110:       v  += 49;
111:     }
112:     idx      = 7*i;
113:     t[idx]   = s1;t[1+idx] = s2;
114:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
115:     t[5+idx] = s6;t[6+idx] = s7;
116:   }
117:   /* backward solve the upper triangular */
118:   for (i=n-1; i>=0; i--) {
119:     v   = aa + 49*diag[i] + 49;
120:     vi  = aj + diag[i] + 1;
121:     nz  = ai[i+1] - diag[i] - 1;
122:     idt = 7*i;
123:     s1  = t[idt];  s2 = t[1+idt];
124:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
125:     s6  = t[5+idt];s7 = t[6+idt];
126:     while (nz--) {
127:       idx = 7*(*vi++);
128:       x1  = t[idx];   x2 = t[1+idx];
129:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
130:       x6  = t[5+idx]; x7 = t[6+idx];
131:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
132:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
133:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
134:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
135:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
136:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
137:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
138:       v  += 49;
139:     }
140:     idc    = 7*(*c--);
141:     v      = aa + 49*diag[i];
142:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
143:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
144:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
145:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
146:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
147:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
148:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
149:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
150:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
151:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
152:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
153:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
154:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
155:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
156:   }

158:   ISRestoreIndices(isrow,&rout);
159:   ISRestoreIndices(iscol,&cout);
160:   VecRestoreArrayRead(bb,&b);
161:   VecRestoreArray(xx,&x);
162:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
163:   return(0);
164: }

168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
169: {
170:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
171:   IS                iscol=a->col,isrow=a->row;
172:   PetscErrorCode    ierr;
173:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
174:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
175:   PetscInt          i,nz,idx,idt,idc,m;
176:   const MatScalar   *aa=a->a,*v;
177:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
178:   const PetscScalar *b;

181:   VecGetArrayRead(bb,&b);
182:   VecGetArray(xx,&x);
183:   t    = a->solve_work;

185:   ISGetIndices(isrow,&rout); r = rout;
186:   ISGetIndices(iscol,&cout); c = cout;

188:   /* forward solve the lower triangular */
189:   idx  = 7*r[0];
190:   t[0] = b[idx];   t[1] = b[1+idx];
191:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
192:   t[5] = b[5+idx]; t[6] = b[6+idx];

194:   for (i=1; i<n; i++) {
195:     v   = aa + 49*ai[i];
196:     vi  = aj + ai[i];
197:     nz  = ai[i+1] - ai[i];
198:     idx = 7*r[i];
199:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
200:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
201:     for (m=0; m<nz; m++) {
202:       idx = 7*vi[m];
203:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
204:       x4  = t[3+idx];x5 = t[4+idx];
205:       x6  = t[5+idx];x7 = t[6+idx];
206:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
207:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
208:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
209:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
210:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
211:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
212:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
213:       v  += 49;
214:     }
215:     idx      = 7*i;
216:     t[idx]   = s1;t[1+idx] = s2;
217:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
218:     t[5+idx] = s6;t[6+idx] = s7;
219:   }
220:   /* backward solve the upper triangular */
221:   for (i=n-1; i>=0; i--) {
222:     v   = aa + 49*(adiag[i+1]+1);
223:     vi  = aj + adiag[i+1]+1;
224:     nz  = adiag[i] - adiag[i+1] - 1;
225:     idt = 7*i;
226:     s1  = t[idt];  s2 = t[1+idt];
227:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
228:     s6  = t[5+idt];s7 = t[6+idt];
229:     for (m=0; m<nz; m++) {
230:       idx = 7*vi[m];
231:       x1  = t[idx];   x2 = t[1+idx];
232:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
233:       x6  = t[5+idx]; x7 = t[6+idx];
234:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
235:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
236:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
237:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
238:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
239:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
240:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
241:       v  += 49;
242:     }
243:     idc    = 7*c[i];
244:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
245:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
246:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
247:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
248:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
249:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
250:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
251:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
252:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
253:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
254:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
255:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
256:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
257:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
258:   }

260:   ISRestoreIndices(isrow,&rout);
261:   ISRestoreIndices(iscol,&cout);
262:   VecRestoreArrayRead(bb,&b);
263:   VecRestoreArray(xx,&x);
264:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
265:   return(0);
266: }

270: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
271: {
272:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
273:   IS                iscol=a->col,isrow=a->row;
274:   PetscErrorCode    ierr;
275:   const PetscInt    *r,*c,*rout,*cout;
276:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
277:   PetscInt          i,nz,idx,idt,idc;
278:   const MatScalar   *aa=a->a,*v;
279:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
280:   const PetscScalar *b;

283:   VecGetArrayRead(bb,&b);
284:   VecGetArray(xx,&x);
285:   t    = a->solve_work;

287:   ISGetIndices(isrow,&rout); r = rout;
288:   ISGetIndices(iscol,&cout); c = cout + (n-1);

290:   /* forward solve the lower triangular */
291:   idx  = 6*(*r++);
292:   t[0] = b[idx];   t[1] = b[1+idx];
293:   t[2] = b[2+idx]; t[3] = b[3+idx];
294:   t[4] = b[4+idx]; t[5] = b[5+idx];
295:   for (i=1; i<n; i++) {
296:     v   = aa + 36*ai[i];
297:     vi  = aj + ai[i];
298:     nz  = diag[i] - ai[i];
299:     idx = 6*(*r++);
300:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
301:     s5  = b[4+idx]; s6 = b[5+idx];
302:     while (nz--) {
303:       idx = 6*(*vi++);
304:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
305:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
306:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
307:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
308:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
309:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
310:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
311:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
312:       v  += 36;
313:     }
314:     idx      = 6*i;
315:     t[idx]   = s1;t[1+idx] = s2;
316:     t[2+idx] = s3;t[3+idx] = s4;
317:     t[4+idx] = s5;t[5+idx] = s6;
318:   }
319:   /* backward solve the upper triangular */
320:   for (i=n-1; i>=0; i--) {
321:     v   = aa + 36*diag[i] + 36;
322:     vi  = aj + diag[i] + 1;
323:     nz  = ai[i+1] - diag[i] - 1;
324:     idt = 6*i;
325:     s1  = t[idt];  s2 = t[1+idt];
326:     s3  = t[2+idt];s4 = t[3+idt];
327:     s5  = t[4+idt];s6 = t[5+idt];
328:     while (nz--) {
329:       idx = 6*(*vi++);
330:       x1  = t[idx];   x2 = t[1+idx];
331:       x3  = t[2+idx]; x4 = t[3+idx];
332:       x5  = t[4+idx]; x6 = t[5+idx];
333:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
334:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
335:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
336:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
337:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
338:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
339:       v  += 36;
340:     }
341:     idc    = 6*(*c--);
342:     v      = aa + 36*diag[i];
343:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
344:                         v[18]*s4+v[24]*s5+v[30]*s6;
345:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
346:                           v[19]*s4+v[25]*s5+v[31]*s6;
347:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
348:                           v[20]*s4+v[26]*s5+v[32]*s6;
349:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
350:                           v[21]*s4+v[27]*s5+v[33]*s6;
351:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
352:                           v[22]*s4+v[28]*s5+v[34]*s6;
353:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
354:                           v[23]*s4+v[29]*s5+v[35]*s6;
355:   }

357:   ISRestoreIndices(isrow,&rout);
358:   ISRestoreIndices(iscol,&cout);
359:   VecRestoreArrayRead(bb,&b);
360:   VecRestoreArray(xx,&x);
361:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
362:   return(0);
363: }

367: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
368: {
369:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
370:   IS                iscol=a->col,isrow=a->row;
371:   PetscErrorCode    ierr;
372:   const PetscInt    *r,*c,*rout,*cout;
373:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
374:   PetscInt          i,nz,idx,idt,idc,m;
375:   const MatScalar   *aa=a->a,*v;
376:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
377:   const PetscScalar *b;

380:   VecGetArrayRead(bb,&b);
381:   VecGetArray(xx,&x);
382:   t    = a->solve_work;

384:   ISGetIndices(isrow,&rout); r = rout;
385:   ISGetIndices(iscol,&cout); c = cout;

387:   /* forward solve the lower triangular */
388:   idx  = 6*r[0];
389:   t[0] = b[idx];   t[1] = b[1+idx];
390:   t[2] = b[2+idx]; t[3] = b[3+idx];
391:   t[4] = b[4+idx]; t[5] = b[5+idx];
392:   for (i=1; i<n; i++) {
393:     v   = aa + 36*ai[i];
394:     vi  = aj + ai[i];
395:     nz  = ai[i+1] - ai[i];
396:     idx = 6*r[i];
397:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
398:     s5  = b[4+idx]; s6 = b[5+idx];
399:     for (m=0; m<nz; m++) {
400:       idx = 6*vi[m];
401:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
402:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
403:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
404:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
405:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
406:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
407:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
408:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
409:       v  += 36;
410:     }
411:     idx      = 6*i;
412:     t[idx]   = s1;t[1+idx] = s2;
413:     t[2+idx] = s3;t[3+idx] = s4;
414:     t[4+idx] = s5;t[5+idx] = s6;
415:   }
416:   /* backward solve the upper triangular */
417:   for (i=n-1; i>=0; i--) {
418:     v   = aa + 36*(adiag[i+1]+1);
419:     vi  = aj + adiag[i+1]+1;
420:     nz  = adiag[i] - adiag[i+1] - 1;
421:     idt = 6*i;
422:     s1  = t[idt];  s2 = t[1+idt];
423:     s3  = t[2+idt];s4 = t[3+idt];
424:     s5  = t[4+idt];s6 = t[5+idt];
425:     for (m=0; m<nz; m++) {
426:       idx = 6*vi[m];
427:       x1  = t[idx];   x2 = t[1+idx];
428:       x3  = t[2+idx]; x4 = t[3+idx];
429:       x5  = t[4+idx]; x6 = t[5+idx];
430:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
431:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
432:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
433:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
434:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
435:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
436:       v  += 36;
437:     }
438:     idc    = 6*c[i];
439:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
440:                         v[18]*s4+v[24]*s5+v[30]*s6;
441:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
442:                           v[19]*s4+v[25]*s5+v[31]*s6;
443:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
444:                           v[20]*s4+v[26]*s5+v[32]*s6;
445:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
446:                           v[21]*s4+v[27]*s5+v[33]*s6;
447:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
448:                           v[22]*s4+v[28]*s5+v[34]*s6;
449:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
450:                           v[23]*s4+v[29]*s5+v[35]*s6;
451:   }

453:   ISRestoreIndices(isrow,&rout);
454:   ISRestoreIndices(iscol,&cout);
455:   VecRestoreArrayRead(bb,&b);
456:   VecRestoreArray(xx,&x);
457:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
458:   return(0);
459: }

463: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
464: {
465:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
466:   IS                iscol=a->col,isrow=a->row;
467:   PetscErrorCode    ierr;
468:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
469:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
470:   PetscInt          i,nz,idx,idt,idc;
471:   const MatScalar   *aa=a->a,*v;
472:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
473:   const PetscScalar *b;

476:   VecGetArrayRead(bb,&b);
477:   VecGetArray(xx,&x);
478:   t    = a->solve_work;

480:   ISGetIndices(isrow,&rout); r = rout;
481:   ISGetIndices(iscol,&cout); c = cout + (n-1);

483:   /* forward solve the lower triangular */
484:   idx  = 5*(*r++);
485:   t[0] = b[idx];   t[1] = b[1+idx];
486:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
487:   for (i=1; i<n; i++) {
488:     v   = aa + 25*ai[i];
489:     vi  = aj + ai[i];
490:     nz  = diag[i] - ai[i];
491:     idx = 5*(*r++);
492:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
493:     s5  = b[4+idx];
494:     while (nz--) {
495:       idx = 5*(*vi++);
496:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
497:       x4  = t[3+idx];x5 = t[4+idx];
498:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
499:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
500:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
501:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
502:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
503:       v  += 25;
504:     }
505:     idx      = 5*i;
506:     t[idx]   = s1;t[1+idx] = s2;
507:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
508:   }
509:   /* backward solve the upper triangular */
510:   for (i=n-1; i>=0; i--) {
511:     v   = aa + 25*diag[i] + 25;
512:     vi  = aj + diag[i] + 1;
513:     nz  = ai[i+1] - diag[i] - 1;
514:     idt = 5*i;
515:     s1  = t[idt];  s2 = t[1+idt];
516:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
517:     while (nz--) {
518:       idx = 5*(*vi++);
519:       x1  = t[idx];   x2 = t[1+idx];
520:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
521:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
522:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
523:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
524:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
525:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
526:       v  += 25;
527:     }
528:     idc    = 5*(*c--);
529:     v      = aa + 25*diag[i];
530:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
531:                         v[15]*s4+v[20]*s5;
532:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
533:                           v[16]*s4+v[21]*s5;
534:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
535:                           v[17]*s4+v[22]*s5;
536:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
537:                           v[18]*s4+v[23]*s5;
538:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
539:                           v[19]*s4+v[24]*s5;
540:   }

542:   ISRestoreIndices(isrow,&rout);
543:   ISRestoreIndices(iscol,&cout);
544:   VecRestoreArrayRead(bb,&b);
545:   VecRestoreArray(xx,&x);
546:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
547:   return(0);
548: }

552: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
553: {
554:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
555:   IS                iscol=a->col,isrow=a->row;
556:   PetscErrorCode    ierr;
557:   const PetscInt    *r,*c,*rout,*cout;
558:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
559:   PetscInt          i,nz,idx,idt,idc,m;
560:   const MatScalar   *aa=a->a,*v;
561:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
562:   const PetscScalar *b;

565:   VecGetArrayRead(bb,&b);
566:   VecGetArray(xx,&x);
567:   t    = a->solve_work;

569:   ISGetIndices(isrow,&rout); r = rout;
570:   ISGetIndices(iscol,&cout); c = cout;

572:   /* forward solve the lower triangular */
573:   idx  = 5*r[0];
574:   t[0] = b[idx];   t[1] = b[1+idx];
575:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
576:   for (i=1; i<n; i++) {
577:     v   = aa + 25*ai[i];
578:     vi  = aj + ai[i];
579:     nz  = ai[i+1] - ai[i];
580:     idx = 5*r[i];
581:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
582:     s5  = b[4+idx];
583:     for (m=0; m<nz; m++) {
584:       idx = 5*vi[m];
585:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
586:       x4  = t[3+idx];x5 = t[4+idx];
587:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
588:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
589:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
590:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
591:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
592:       v  += 25;
593:     }
594:     idx      = 5*i;
595:     t[idx]   = s1;t[1+idx] = s2;
596:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
597:   }
598:   /* backward solve the upper triangular */
599:   for (i=n-1; i>=0; i--) {
600:     v   = aa + 25*(adiag[i+1]+1);
601:     vi  = aj + adiag[i+1]+1;
602:     nz  = adiag[i] - adiag[i+1] - 1;
603:     idt = 5*i;
604:     s1  = t[idt];  s2 = t[1+idt];
605:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
606:     for (m=0; m<nz; m++) {
607:       idx = 5*vi[m];
608:       x1  = t[idx];   x2 = t[1+idx];
609:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
610:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
611:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
612:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
613:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
614:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
615:       v  += 25;
616:     }
617:     idc    = 5*c[i];
618:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
619:                         v[15]*s4+v[20]*s5;
620:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
621:                           v[16]*s4+v[21]*s5;
622:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
623:                           v[17]*s4+v[22]*s5;
624:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
625:                           v[18]*s4+v[23]*s5;
626:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
627:                           v[19]*s4+v[24]*s5;
628:   }

630:   ISRestoreIndices(isrow,&rout);
631:   ISRestoreIndices(iscol,&cout);
632:   VecRestoreArrayRead(bb,&b);
633:   VecRestoreArray(xx,&x);
634:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
635:   return(0);
636: }

640: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
641: {
642:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
643:   IS                iscol=a->col,isrow=a->row;
644:   PetscErrorCode    ierr;
645:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646:   PetscInt          i,nz,idx,idt,idc;
647:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
648:   const MatScalar   *aa=a->a,*v;
649:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
650:   const PetscScalar *b;

653:   VecGetArrayRead(bb,&b);
654:   VecGetArray(xx,&x);
655:   t    = a->solve_work;

657:   ISGetIndices(isrow,&rout); r = rout;
658:   ISGetIndices(iscol,&cout); c = cout + (n-1);

660:   /* forward solve the lower triangular */
661:   idx  = 4*(*r++);
662:   t[0] = b[idx];   t[1] = b[1+idx];
663:   t[2] = b[2+idx]; t[3] = b[3+idx];
664:   for (i=1; i<n; i++) {
665:     v   = aa + 16*ai[i];
666:     vi  = aj + ai[i];
667:     nz  = diag[i] - ai[i];
668:     idx = 4*(*r++);
669:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
670:     while (nz--) {
671:       idx = 4*(*vi++);
672:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
673:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
674:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
675:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
676:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
677:       v  += 16;
678:     }
679:     idx      = 4*i;
680:     t[idx]   = s1;t[1+idx] = s2;
681:     t[2+idx] = s3;t[3+idx] = s4;
682:   }
683:   /* backward solve the upper triangular */
684:   for (i=n-1; i>=0; i--) {
685:     v   = aa + 16*diag[i] + 16;
686:     vi  = aj + diag[i] + 1;
687:     nz  = ai[i+1] - diag[i] - 1;
688:     idt = 4*i;
689:     s1  = t[idt];  s2 = t[1+idt];
690:     s3  = t[2+idt];s4 = t[3+idt];
691:     while (nz--) {
692:       idx = 4*(*vi++);
693:       x1  = t[idx];   x2 = t[1+idx];
694:       x3  = t[2+idx]; x4 = t[3+idx];
695:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
696:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
697:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
698:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
699:       v  += 16;
700:     }
701:     idc      = 4*(*c--);
702:     v        = aa + 16*diag[i];
703:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
704:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
705:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
706:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
707:   }

709:   ISRestoreIndices(isrow,&rout);
710:   ISRestoreIndices(iscol,&cout);
711:   VecRestoreArrayRead(bb,&b);
712:   VecRestoreArray(xx,&x);
713:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
714:   return(0);
715: }

719: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
720: {
721:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
722:   IS                iscol=a->col,isrow=a->row;
723:   PetscErrorCode    ierr;
724:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
725:   PetscInt          i,nz,idx,idt,idc,m;
726:   const PetscInt    *r,*c,*rout,*cout;
727:   const MatScalar   *aa=a->a,*v;
728:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
729:   const PetscScalar *b;

732:   VecGetArrayRead(bb,&b);
733:   VecGetArray(xx,&x);
734:   t    = a->solve_work;

736:   ISGetIndices(isrow,&rout); r = rout;
737:   ISGetIndices(iscol,&cout); c = cout;

739:   /* forward solve the lower triangular */
740:   idx  = 4*r[0];
741:   t[0] = b[idx];   t[1] = b[1+idx];
742:   t[2] = b[2+idx]; t[3] = b[3+idx];
743:   for (i=1; i<n; i++) {
744:     v   = aa + 16*ai[i];
745:     vi  = aj + ai[i];
746:     nz  = ai[i+1] - ai[i];
747:     idx = 4*r[i];
748:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
749:     for (m=0; m<nz; m++) {
750:       idx = 4*vi[m];
751:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
752:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
753:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
754:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
755:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
756:       v  += 16;
757:     }
758:     idx      = 4*i;
759:     t[idx]   = s1;t[1+idx] = s2;
760:     t[2+idx] = s3;t[3+idx] = s4;
761:   }
762:   /* backward solve the upper triangular */
763:   for (i=n-1; i>=0; i--) {
764:     v   = aa + 16*(adiag[i+1]+1);
765:     vi  = aj + adiag[i+1]+1;
766:     nz  = adiag[i] - adiag[i+1] - 1;
767:     idt = 4*i;
768:     s1  = t[idt];  s2 = t[1+idt];
769:     s3  = t[2+idt];s4 = t[3+idt];
770:     for (m=0; m<nz; m++) {
771:       idx = 4*vi[m];
772:       x1  = t[idx];   x2 = t[1+idx];
773:       x3  = t[2+idx]; x4 = t[3+idx];
774:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
775:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
776:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
777:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
778:       v  += 16;
779:     }
780:     idc      = 4*c[i];
781:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
782:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
783:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
784:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
785:   }

787:   ISRestoreIndices(isrow,&rout);
788:   ISRestoreIndices(iscol,&cout);
789:   VecRestoreArrayRead(bb,&b);
790:   VecRestoreArray(xx,&x);
791:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
792:   return(0);
793: }

797: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
798: {
799:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
800:   IS                iscol=a->col,isrow=a->row;
801:   PetscErrorCode    ierr;
802:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
803:   PetscInt          i,nz,idx,idt,idc;
804:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
805:   const MatScalar   *aa=a->a,*v;
806:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
807:   PetscScalar       *x;
808:   const PetscScalar *b;

811:   VecGetArrayRead(bb,&b);
812:   VecGetArray(xx,&x);
813:   t    = (MatScalar*)a->solve_work;

815:   ISGetIndices(isrow,&rout); r = rout;
816:   ISGetIndices(iscol,&cout); c = cout + (n-1);

818:   /* forward solve the lower triangular */
819:   idx  = 4*(*r++);
820:   t[0] = (MatScalar)b[idx];
821:   t[1] = (MatScalar)b[1+idx];
822:   t[2] = (MatScalar)b[2+idx];
823:   t[3] = (MatScalar)b[3+idx];
824:   for (i=1; i<n; i++) {
825:     v   = aa + 16*ai[i];
826:     vi  = aj + ai[i];
827:     nz  = diag[i] - ai[i];
828:     idx = 4*(*r++);
829:     s1  = (MatScalar)b[idx];
830:     s2  = (MatScalar)b[1+idx];
831:     s3  = (MatScalar)b[2+idx];
832:     s4  = (MatScalar)b[3+idx];
833:     while (nz--) {
834:       idx = 4*(*vi++);
835:       x1  = t[idx];
836:       x2  = t[1+idx];
837:       x3  = t[2+idx];
838:       x4  = t[3+idx];
839:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
840:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
841:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
842:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
843:       v  += 16;
844:     }
845:     idx      = 4*i;
846:     t[idx]   = s1;
847:     t[1+idx] = s2;
848:     t[2+idx] = s3;
849:     t[3+idx] = s4;
850:   }
851:   /* backward solve the upper triangular */
852:   for (i=n-1; i>=0; i--) {
853:     v   = aa + 16*diag[i] + 16;
854:     vi  = aj + diag[i] + 1;
855:     nz  = ai[i+1] - diag[i] - 1;
856:     idt = 4*i;
857:     s1  = t[idt];
858:     s2  = t[1+idt];
859:     s3  = t[2+idt];
860:     s4  = t[3+idt];
861:     while (nz--) {
862:       idx = 4*(*vi++);
863:       x1  = t[idx];
864:       x2  = t[1+idx];
865:       x3  = t[2+idx];
866:       x4  = t[3+idx];
867:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
868:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
869:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
870:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
871:       v  += 16;
872:     }
873:     idc      = 4*(*c--);
874:     v        = aa + 16*diag[i];
875:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
876:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
877:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
878:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
879:     x[idc]   = (PetscScalar)t[idt];
880:     x[1+idc] = (PetscScalar)t[1+idt];
881:     x[2+idc] = (PetscScalar)t[2+idt];
882:     x[3+idc] = (PetscScalar)t[3+idt];
883:   }

885:   ISRestoreIndices(isrow,&rout);
886:   ISRestoreIndices(iscol,&cout);
887:   VecRestoreArrayRead(bb,&b);
888:   VecRestoreArray(xx,&x);
889:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
890:   return(0);
891: }

893: #if defined(PETSC_HAVE_SSE)

895: #include PETSC_HAVE_SSE

899: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
900: {
901:   /*
902:      Note: This code uses demotion of double
903:      to float when performing the mixed-mode computation.
904:      This may not be numerically reasonable for all applications.
905:   */
906:   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
907:   IS             iscol=a->col,isrow=a->row;
909:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
910:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
911:   MatScalar      *aa=a->a,*v;
912:   PetscScalar    *x,*b,*t;

914:   /* Make space in temp stack for 16 Byte Aligned arrays */
915:   float         ssealignedspace[11],*tmps,*tmpx;
916:   unsigned long offset;

919:   SSE_SCOPE_BEGIN;

921:   offset = (unsigned long)ssealignedspace % 16;
922:   if (offset) offset = (16 - offset)/4;
923:   tmps = &ssealignedspace[offset];
924:   tmpx = &ssealignedspace[offset+4];
925:   PREFETCH_NTA(aa+16*ai[1]);

927:   VecGetArray(bb,&b);
928:   VecGetArray(xx,&x);
929:   t    = a->solve_work;

931:   ISGetIndices(isrow,&rout); r = rout;
932:   ISGetIndices(iscol,&cout); c = cout + (n-1);

934:   /* forward solve the lower triangular */
935:   idx  = 4*(*r++);
936:   t[0] = b[idx];   t[1] = b[1+idx];
937:   t[2] = b[2+idx]; t[3] = b[3+idx];
938:   v    =  aa + 16*ai[1];

940:   for (i=1; i<n; ) {
941:     PREFETCH_NTA(&v[8]);
942:     vi  =  aj      + ai[i];
943:     nz  =  diag[i] - ai[i];
944:     idx =  4*(*r++);

946:     /* Demote sum from double to float */
947:     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
948:     LOAD_PS(tmps,XMM7);

950:     while (nz--) {
951:       PREFETCH_NTA(&v[16]);
952:       idx = 4*(*vi++);

954:       /* Demote solution (so far) from double to float */
955:       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

957:       /* 4x4 Matrix-Vector product with negative accumulation: */
958:       SSE_INLINE_BEGIN_2(tmpx,v)
959:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

961:       /* First Column */
962:       SSE_COPY_PS(XMM0,XMM6)
963:       SSE_SHUFFLE(XMM0,XMM0,0x00)
964:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
965:       SSE_SUB_PS(XMM7,XMM0)

967:       /* Second Column */
968:       SSE_COPY_PS(XMM1,XMM6)
969:       SSE_SHUFFLE(XMM1,XMM1,0x55)
970:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
971:       SSE_SUB_PS(XMM7,XMM1)

973:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

975:       /* Third Column */
976:       SSE_COPY_PS(XMM2,XMM6)
977:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
978:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
979:       SSE_SUB_PS(XMM7,XMM2)

981:       /* Fourth Column */
982:       SSE_COPY_PS(XMM3,XMM6)
983:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
984:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
985:       SSE_SUB_PS(XMM7,XMM3)
986:       SSE_INLINE_END_2

988:       v += 16;
989:     }
990:     idx = 4*i;
991:     v   = aa + 16*ai[++i];
992:     PREFETCH_NTA(v);
993:     STORE_PS(tmps,XMM7);

995:     /* Promote result from float to double */
996:     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
997:   }
998:   /* backward solve the upper triangular */
999:   idt  = 4*(n-1);
1000:   ai16 = 16*diag[n-1];
1001:   v    = aa + ai16 + 16;
1002:   for (i=n-1; i>=0; ) {
1003:     PREFETCH_NTA(&v[8]);
1004:     vi = aj + diag[i] + 1;
1005:     nz = ai[i+1] - diag[i] - 1;

1007:     /* Demote accumulator from double to float */
1008:     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1009:     LOAD_PS(tmps,XMM7);

1011:     while (nz--) {
1012:       PREFETCH_NTA(&v[16]);
1013:       idx = 4*(*vi++);

1015:       /* Demote solution (so far) from double to float */
1016:       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1018:       /* 4x4 Matrix-Vector Product with negative accumulation: */
1019:       SSE_INLINE_BEGIN_2(tmpx,v)
1020:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1022:       /* First Column */
1023:       SSE_COPY_PS(XMM0,XMM6)
1024:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1025:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1026:       SSE_SUB_PS(XMM7,XMM0)

1028:       /* Second Column */
1029:       SSE_COPY_PS(XMM1,XMM6)
1030:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1031:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1032:       SSE_SUB_PS(XMM7,XMM1)

1034:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1036:       /* Third Column */
1037:       SSE_COPY_PS(XMM2,XMM6)
1038:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1039:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1040:       SSE_SUB_PS(XMM7,XMM2)

1042:       /* Fourth Column */
1043:       SSE_COPY_PS(XMM3,XMM6)
1044:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1045:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1046:       SSE_SUB_PS(XMM7,XMM3)
1047:       SSE_INLINE_END_2
1048:       v += 16;
1049:     }
1050:     v    = aa + ai16;
1051:     ai16 = 16*diag[--i];
1052:     PREFETCH_NTA(aa+ai16+16);
1053:     /*
1054:        Scale the result by the diagonal 4x4 block,
1055:        which was inverted as part of the factorization
1056:     */
1057:     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1058:     /* First Column */
1059:     SSE_COPY_PS(XMM0,XMM7)
1060:     SSE_SHUFFLE(XMM0,XMM0,0x00)
1061:     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1063:     /* Second Column */
1064:     SSE_COPY_PS(XMM1,XMM7)
1065:     SSE_SHUFFLE(XMM1,XMM1,0x55)
1066:     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1067:     SSE_ADD_PS(XMM0,XMM1)

1069:     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1071:     /* Third Column */
1072:     SSE_COPY_PS(XMM2,XMM7)
1073:     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1074:     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1075:     SSE_ADD_PS(XMM0,XMM2)

1077:     /* Fourth Column */
1078:     SSE_COPY_PS(XMM3,XMM7)
1079:     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1080:     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1081:     SSE_ADD_PS(XMM0,XMM3)

1083:     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1084:     SSE_INLINE_END_3

1086:     /* Promote solution from float to double */
1087:     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

1089:     /* Apply reordering to t and stream into x.    */
1090:     /* This way, x doesn't pollute the cache.      */
1091:     /* Be careful with size: 2 doubles = 4 floats! */
1092:     idc = 4*(*c--);
1093:     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1094:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1095:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1097:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1098:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1100:     SSE_INLINE_END_2
1101:     v    = aa + ai16 + 16;
1102:     idt -= 4;
1103:   }

1105:   ISRestoreIndices(isrow,&rout);
1106:   ISRestoreIndices(iscol,&cout);
1107:   VecRestoreArray(bb,&b);
1108:   VecRestoreArray(xx,&x);
1109:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1110:   SSE_SCOPE_END;
1111:   return(0);
1112: }

1114: #endif

1118: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1119: {
1120:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1121:   IS                iscol=a->col,isrow=a->row;
1122:   PetscErrorCode    ierr;
1123:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1124:   PetscInt          i,nz,idx,idt,idc;
1125:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1126:   const MatScalar   *aa=a->a,*v;
1127:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1128:   const PetscScalar *b;

1131:   VecGetArrayRead(bb,&b);
1132:   VecGetArray(xx,&x);
1133:   t    = a->solve_work;

1135:   ISGetIndices(isrow,&rout); r = rout;
1136:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1138:   /* forward solve the lower triangular */
1139:   idx  = 3*(*r++);
1140:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1141:   for (i=1; i<n; i++) {
1142:     v   = aa + 9*ai[i];
1143:     vi  = aj + ai[i];
1144:     nz  = diag[i] - ai[i];
1145:     idx = 3*(*r++);
1146:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1147:     while (nz--) {
1148:       idx = 3*(*vi++);
1149:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1150:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1151:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1152:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1153:       v  += 9;
1154:     }
1155:     idx    = 3*i;
1156:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1157:   }
1158:   /* backward solve the upper triangular */
1159:   for (i=n-1; i>=0; i--) {
1160:     v   = aa + 9*diag[i] + 9;
1161:     vi  = aj + diag[i] + 1;
1162:     nz  = ai[i+1] - diag[i] - 1;
1163:     idt = 3*i;
1164:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1165:     while (nz--) {
1166:       idx = 3*(*vi++);
1167:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1168:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1169:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1170:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1171:       v  += 9;
1172:     }
1173:     idc      = 3*(*c--);
1174:     v        = aa + 9*diag[i];
1175:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1176:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1177:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1178:   }
1179:   ISRestoreIndices(isrow,&rout);
1180:   ISRestoreIndices(iscol,&cout);
1181:   VecRestoreArrayRead(bb,&b);
1182:   VecRestoreArray(xx,&x);
1183:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1184:   return(0);
1185: }

1189: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1190: {
1191:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1192:   IS                iscol=a->col,isrow=a->row;
1193:   PetscErrorCode    ierr;
1194:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1195:   PetscInt          i,nz,idx,idt,idc,m;
1196:   const PetscInt    *r,*c,*rout,*cout;
1197:   const MatScalar   *aa=a->a,*v;
1198:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1199:   const PetscScalar *b;

1202:   VecGetArrayRead(bb,&b);
1203:   VecGetArray(xx,&x);
1204:   t    = a->solve_work;

1206:   ISGetIndices(isrow,&rout); r = rout;
1207:   ISGetIndices(iscol,&cout); c = cout;

1209:   /* forward solve the lower triangular */
1210:   idx  = 3*r[0];
1211:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1212:   for (i=1; i<n; i++) {
1213:     v   = aa + 9*ai[i];
1214:     vi  = aj + ai[i];
1215:     nz  = ai[i+1] - ai[i];
1216:     idx = 3*r[i];
1217:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1218:     for (m=0; m<nz; m++) {
1219:       idx = 3*vi[m];
1220:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1221:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1222:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1223:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1224:       v  += 9;
1225:     }
1226:     idx    = 3*i;
1227:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1228:   }
1229:   /* backward solve the upper triangular */
1230:   for (i=n-1; i>=0; i--) {
1231:     v   = aa + 9*(adiag[i+1]+1);
1232:     vi  = aj + adiag[i+1]+1;
1233:     nz  = adiag[i] - adiag[i+1] - 1;
1234:     idt = 3*i;
1235:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1236:     for (m=0; m<nz; m++) {
1237:       idx = 3*vi[m];
1238:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1239:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1240:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1241:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1242:       v  += 9;
1243:     }
1244:     idc      = 3*c[i];
1245:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1246:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1247:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1248:   }
1249:   ISRestoreIndices(isrow,&rout);
1250:   ISRestoreIndices(iscol,&cout);
1251:   VecRestoreArrayRead(bb,&b);
1252:   VecRestoreArray(xx,&x);
1253:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1254:   return(0);
1255: }

1259: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1260: {
1261:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1262:   IS                iscol=a->col,isrow=a->row;
1263:   PetscErrorCode    ierr;
1264:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1265:   PetscInt          i,nz,idx,idt,idc;
1266:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1267:   const MatScalar   *aa=a->a,*v;
1268:   PetscScalar       *x,s1,s2,x1,x2,*t;
1269:   const PetscScalar *b;

1272:   VecGetArrayRead(bb,&b);
1273:   VecGetArray(xx,&x);
1274:   t    = a->solve_work;

1276:   ISGetIndices(isrow,&rout); r = rout;
1277:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1279:   /* forward solve the lower triangular */
1280:   idx  = 2*(*r++);
1281:   t[0] = b[idx]; t[1] = b[1+idx];
1282:   for (i=1; i<n; i++) {
1283:     v   = aa + 4*ai[i];
1284:     vi  = aj + ai[i];
1285:     nz  = diag[i] - ai[i];
1286:     idx = 2*(*r++);
1287:     s1  = b[idx]; s2 = b[1+idx];
1288:     while (nz--) {
1289:       idx = 2*(*vi++);
1290:       x1  = t[idx]; x2 = t[1+idx];
1291:       s1 -= v[0]*x1 + v[2]*x2;
1292:       s2 -= v[1]*x1 + v[3]*x2;
1293:       v  += 4;
1294:     }
1295:     idx    = 2*i;
1296:     t[idx] = s1; t[1+idx] = s2;
1297:   }
1298:   /* backward solve the upper triangular */
1299:   for (i=n-1; i>=0; i--) {
1300:     v   = aa + 4*diag[i] + 4;
1301:     vi  = aj + diag[i] + 1;
1302:     nz  = ai[i+1] - diag[i] - 1;
1303:     idt = 2*i;
1304:     s1  = t[idt]; s2 = t[1+idt];
1305:     while (nz--) {
1306:       idx = 2*(*vi++);
1307:       x1  = t[idx]; x2 = t[1+idx];
1308:       s1 -= v[0]*x1 + v[2]*x2;
1309:       s2 -= v[1]*x1 + v[3]*x2;
1310:       v  += 4;
1311:     }
1312:     idc      = 2*(*c--);
1313:     v        = aa + 4*diag[i];
1314:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1315:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1316:   }
1317:   ISRestoreIndices(isrow,&rout);
1318:   ISRestoreIndices(iscol,&cout);
1319:   VecRestoreArrayRead(bb,&b);
1320:   VecRestoreArray(xx,&x);
1321:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1322:   return(0);
1323: }

1327: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1328: {
1329:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1330:   IS                iscol=a->col,isrow=a->row;
1331:   PetscErrorCode    ierr;
1332:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1333:   PetscInt          i,nz,idx,jdx,idt,idc,m;
1334:   const PetscInt    *r,*c,*rout,*cout;
1335:   const MatScalar   *aa=a->a,*v;
1336:   PetscScalar       *x,s1,s2,x1,x2,*t;
1337:   const PetscScalar *b;

1340:   VecGetArrayRead(bb,&b);
1341:   VecGetArray(xx,&x);
1342:   t    = a->solve_work;

1344:   ISGetIndices(isrow,&rout); r = rout;
1345:   ISGetIndices(iscol,&cout); c = cout;

1347:   /* forward solve the lower triangular */
1348:   idx  = 2*r[0];
1349:   t[0] = b[idx]; t[1] = b[1+idx];
1350:   for (i=1; i<n; i++) {
1351:     v   = aa + 4*ai[i];
1352:     vi  = aj + ai[i];
1353:     nz  = ai[i+1] - ai[i];
1354:     idx = 2*r[i];
1355:     s1  = b[idx]; s2 = b[1+idx];
1356:     for (m=0; m<nz; m++) {
1357:       jdx = 2*vi[m];
1358:       x1  = t[jdx]; x2 = t[1+jdx];
1359:       s1 -= v[0]*x1 + v[2]*x2;
1360:       s2 -= v[1]*x1 + v[3]*x2;
1361:       v  += 4;
1362:     }
1363:     idx    = 2*i;
1364:     t[idx] = s1; t[1+idx] = s2;
1365:   }
1366:   /* backward solve the upper triangular */
1367:   for (i=n-1; i>=0; i--) {
1368:     v   = aa + 4*(adiag[i+1]+1);
1369:     vi  = aj + adiag[i+1]+1;
1370:     nz  = adiag[i] - adiag[i+1] - 1;
1371:     idt = 2*i;
1372:     s1  = t[idt]; s2 = t[1+idt];
1373:     for (m=0; m<nz; m++) {
1374:       idx = 2*vi[m];
1375:       x1  = t[idx]; x2 = t[1+idx];
1376:       s1 -= v[0]*x1 + v[2]*x2;
1377:       s2 -= v[1]*x1 + v[3]*x2;
1378:       v  += 4;
1379:     }
1380:     idc      = 2*c[i];
1381:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1382:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1383:   }
1384:   ISRestoreIndices(isrow,&rout);
1385:   ISRestoreIndices(iscol,&cout);
1386:   VecRestoreArrayRead(bb,&b);
1387:   VecRestoreArray(xx,&x);
1388:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1389:   return(0);
1390: }

1394: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1395: {
1396:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1397:   IS                iscol=a->col,isrow=a->row;
1398:   PetscErrorCode    ierr;
1399:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1400:   PetscInt          i,nz;
1401:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1402:   const MatScalar   *aa=a->a,*v;
1403:   PetscScalar       *x,s1,*t;
1404:   const PetscScalar *b;

1407:   if (!n) return(0);

1409:   VecGetArrayRead(bb,&b);
1410:   VecGetArray(xx,&x);
1411:   t    = a->solve_work;

1413:   ISGetIndices(isrow,&rout); r = rout;
1414:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1416:   /* forward solve the lower triangular */
1417:   t[0] = b[*r++];
1418:   for (i=1; i<n; i++) {
1419:     v  = aa + ai[i];
1420:     vi = aj + ai[i];
1421:     nz = diag[i] - ai[i];
1422:     s1 = b[*r++];
1423:     while (nz--) {
1424:       s1 -= (*v++)*t[*vi++];
1425:     }
1426:     t[i] = s1;
1427:   }
1428:   /* backward solve the upper triangular */
1429:   for (i=n-1; i>=0; i--) {
1430:     v  = aa + diag[i] + 1;
1431:     vi = aj + diag[i] + 1;
1432:     nz = ai[i+1] - diag[i] - 1;
1433:     s1 = t[i];
1434:     while (nz--) {
1435:       s1 -= (*v++)*t[*vi++];
1436:     }
1437:     x[*c--] = t[i] = aa[diag[i]]*s1;
1438:   }

1440:   ISRestoreIndices(isrow,&rout);
1441:   ISRestoreIndices(iscol,&cout);
1442:   VecRestoreArrayRead(bb,&b);
1443:   VecRestoreArray(xx,&x);
1444:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1445:   return(0);
1446: }

1450: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1451: {
1452:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1453:   IS                iscol = a->col,isrow = a->row;
1454:   PetscErrorCode    ierr;
1455:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1456:   const PetscInt    *rout,*cout,*r,*c;
1457:   PetscScalar       *x,*tmp,sum;
1458:   const PetscScalar *b;
1459:   const MatScalar   *aa = a->a,*v;

1462:   if (!n) return(0);

1464:   VecGetArrayRead(bb,&b);
1465:   VecGetArray(xx,&x);
1466:   tmp  = a->solve_work;

1468:   ISGetIndices(isrow,&rout); r = rout;
1469:   ISGetIndices(iscol,&cout); c = cout;

1471:   /* forward solve the lower triangular */
1472:   tmp[0] = b[r[0]];
1473:   v      = aa;
1474:   vi     = aj;
1475:   for (i=1; i<n; i++) {
1476:     nz  = ai[i+1] - ai[i];
1477:     sum = b[r[i]];
1478:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1479:     tmp[i] = sum;
1480:     v     += nz; vi += nz;
1481:   }

1483:   /* backward solve the upper triangular */
1484:   for (i=n-1; i>=0; i--) {
1485:     v   = aa + adiag[i+1]+1;
1486:     vi  = aj + adiag[i+1]+1;
1487:     nz  = adiag[i]-adiag[i+1]-1;
1488:     sum = tmp[i];
1489:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1490:     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1491:   }

1493:   ISRestoreIndices(isrow,&rout);
1494:   ISRestoreIndices(iscol,&cout);
1495:   VecRestoreArrayRead(bb,&b);
1496:   VecRestoreArray(xx,&x);
1497:   PetscLogFlops(2*a->nz - A->cmap->n);
1498:   return(0);
1499: }