Actual source code: baijsolv.c

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1:  #include <../src/mat/impls/baij/seq/baij.h>
  2:  #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
  7:   IS                iscol=a->col,isrow=a->row;
  8:   PetscErrorCode    ierr;
  9:   const PetscInt    *r,*c,*rout,*cout;
 10:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
 11:   PetscInt          i,nz;
 12:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
 13:   const MatScalar   *aa=a->a,*v;
 14:   PetscScalar       *x,*s,*t,*ls;
 15:   const PetscScalar *b;

 18:   VecGetArrayRead(bb,&b);
 19:   VecGetArray(xx,&x);
 20:   t    = a->solve_work;

 22:   ISGetIndices(isrow,&rout); r = rout;
 23:   ISGetIndices(iscol,&cout); c = cout + (n-1);

 25:   /* forward solve the lower triangular */
 26:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
 27:   for (i=1; i<n; i++) {
 28:     v    = aa + bs2*ai[i];
 29:     vi   = aj + ai[i];
 30:     nz   = a->diag[i] - ai[i];
 31:     s    = t + bs*i;
 32:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
 33:     while (nz--) {
 34:       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
 35:       v += bs2;
 36:     }
 37:   }
 38:   /* backward solve the upper triangular */
 39:   ls = a->solve_work + A->cmap->n;
 40:   for (i=n-1; i>=0; i--) {
 41:     v    = aa + bs2*(a->diag[i] + 1);
 42:     vi   = aj + a->diag[i] + 1;
 43:     nz   = ai[i+1] - a->diag[i] - 1;
 44:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
 45:     while (nz--) {
 46:       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
 47:       v += bs2;
 48:     }
 49:     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
 50:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
 51:   }

 53:   ISRestoreIndices(isrow,&rout);
 54:   ISRestoreIndices(iscol,&cout);
 55:   VecRestoreArrayRead(bb,&b);
 56:   VecRestoreArray(xx,&x);
 57:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
 58:   return(0);
 59: }

 61: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
 62: {
 63:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
 64:   IS                iscol=a->col,isrow=a->row;
 65:   PetscErrorCode    ierr;
 66:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
 67:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
 68:   PetscInt          i,nz,idx,idt,idc;
 69:   const MatScalar   *aa=a->a,*v;
 70:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
 71:   const PetscScalar *b;

 74:   VecGetArrayRead(bb,&b);
 75:   VecGetArray(xx,&x);
 76:   t    = a->solve_work;

 78:   ISGetIndices(isrow,&rout); r = rout;
 79:   ISGetIndices(iscol,&cout); c = cout + (n-1);

 81:   /* forward solve the lower triangular */
 82:   idx  = 7*(*r++);
 83:   t[0] = b[idx];   t[1] = b[1+idx];
 84:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
 85:   t[5] = b[5+idx]; t[6] = b[6+idx];

 87:   for (i=1; i<n; i++) {
 88:     v   = aa + 49*ai[i];
 89:     vi  = aj + ai[i];
 90:     nz  = diag[i] - ai[i];
 91:     idx = 7*(*r++);
 92:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
 93:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
 94:     while (nz--) {
 95:       idx = 7*(*vi++);
 96:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
 97:       x4  = t[3+idx];x5 = t[4+idx];
 98:       x6  = t[5+idx];x7 = t[6+idx];
 99:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
100:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
101:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
102:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
103:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
104:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
105:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
106:       v  += 49;
107:     }
108:     idx      = 7*i;
109:     t[idx]   = s1;t[1+idx] = s2;
110:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
111:     t[5+idx] = s6;t[6+idx] = s7;
112:   }
113:   /* backward solve the upper triangular */
114:   for (i=n-1; i>=0; i--) {
115:     v   = aa + 49*diag[i] + 49;
116:     vi  = aj + diag[i] + 1;
117:     nz  = ai[i+1] - diag[i] - 1;
118:     idt = 7*i;
119:     s1  = t[idt];  s2 = t[1+idt];
120:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
121:     s6  = t[5+idt];s7 = t[6+idt];
122:     while (nz--) {
123:       idx = 7*(*vi++);
124:       x1  = t[idx];   x2 = t[1+idx];
125:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
126:       x6  = t[5+idx]; x7 = t[6+idx];
127:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
128:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
129:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
130:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
131:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
132:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
133:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134:       v  += 49;
135:     }
136:     idc    = 7*(*c--);
137:     v      = aa + 49*diag[i];
138:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
139:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
140:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
141:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
142:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
143:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
144:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
145:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
146:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
147:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
148:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
149:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
150:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
151:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
152:   }

154:   ISRestoreIndices(isrow,&rout);
155:   ISRestoreIndices(iscol,&cout);
156:   VecRestoreArrayRead(bb,&b);
157:   VecRestoreArray(xx,&x);
158:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
159:   return(0);
160: }

162: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
163: {
164:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
165:   IS                iscol=a->col,isrow=a->row;
166:   PetscErrorCode    ierr;
167:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
168:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
169:   PetscInt          i,nz,idx,idt,idc,m;
170:   const MatScalar   *aa=a->a,*v;
171:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
172:   const PetscScalar *b;

175:   VecGetArrayRead(bb,&b);
176:   VecGetArray(xx,&x);
177:   t    = a->solve_work;

179:   ISGetIndices(isrow,&rout); r = rout;
180:   ISGetIndices(iscol,&cout); c = cout;

182:   /* forward solve the lower triangular */
183:   idx  = 7*r[0];
184:   t[0] = b[idx];   t[1] = b[1+idx];
185:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
186:   t[5] = b[5+idx]; t[6] = b[6+idx];

188:   for (i=1; i<n; i++) {
189:     v   = aa + 49*ai[i];
190:     vi  = aj + ai[i];
191:     nz  = ai[i+1] - ai[i];
192:     idx = 7*r[i];
193:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
194:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
195:     for (m=0; m<nz; m++) {
196:       idx = 7*vi[m];
197:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
198:       x4  = t[3+idx];x5 = t[4+idx];
199:       x6  = t[5+idx];x7 = t[6+idx];
200:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
201:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
202:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
203:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
204:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
205:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
206:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
207:       v  += 49;
208:     }
209:     idx      = 7*i;
210:     t[idx]   = s1;t[1+idx] = s2;
211:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
212:     t[5+idx] = s6;t[6+idx] = s7;
213:   }
214:   /* backward solve the upper triangular */
215:   for (i=n-1; i>=0; i--) {
216:     v   = aa + 49*(adiag[i+1]+1);
217:     vi  = aj + adiag[i+1]+1;
218:     nz  = adiag[i] - adiag[i+1] - 1;
219:     idt = 7*i;
220:     s1  = t[idt];  s2 = t[1+idt];
221:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
222:     s6  = t[5+idt];s7 = t[6+idt];
223:     for (m=0; m<nz; m++) {
224:       idx = 7*vi[m];
225:       x1  = t[idx];   x2 = t[1+idx];
226:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
227:       x6  = t[5+idx]; x7 = t[6+idx];
228:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
229:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
230:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
231:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
232:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
233:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
234:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
235:       v  += 49;
236:     }
237:     idc    = 7*c[i];
238:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
239:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
240:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
241:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
242:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
243:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
244:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
245:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
246:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
247:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
248:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
249:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
250:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
251:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
252:   }

254:   ISRestoreIndices(isrow,&rout);
255:   ISRestoreIndices(iscol,&cout);
256:   VecRestoreArrayRead(bb,&b);
257:   VecRestoreArray(xx,&x);
258:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
259:   return(0);
260: }

262: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
263: {
264:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
265:   IS                iscol=a->col,isrow=a->row;
266:   PetscErrorCode    ierr;
267:   const PetscInt    *r,*c,*rout,*cout;
268:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
269:   PetscInt          i,nz,idx,idt,idc;
270:   const MatScalar   *aa=a->a,*v;
271:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
272:   const PetscScalar *b;

275:   VecGetArrayRead(bb,&b);
276:   VecGetArray(xx,&x);
277:   t    = a->solve_work;

279:   ISGetIndices(isrow,&rout); r = rout;
280:   ISGetIndices(iscol,&cout); c = cout + (n-1);

282:   /* forward solve the lower triangular */
283:   idx  = 6*(*r++);
284:   t[0] = b[idx];   t[1] = b[1+idx];
285:   t[2] = b[2+idx]; t[3] = b[3+idx];
286:   t[4] = b[4+idx]; t[5] = b[5+idx];
287:   for (i=1; i<n; i++) {
288:     v   = aa + 36*ai[i];
289:     vi  = aj + ai[i];
290:     nz  = diag[i] - ai[i];
291:     idx = 6*(*r++);
292:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
293:     s5  = b[4+idx]; s6 = b[5+idx];
294:     while (nz--) {
295:       idx = 6*(*vi++);
296:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
297:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
298:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
299:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
300:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
301:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
302:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
303:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
304:       v  += 36;
305:     }
306:     idx      = 6*i;
307:     t[idx]   = s1;t[1+idx] = s2;
308:     t[2+idx] = s3;t[3+idx] = s4;
309:     t[4+idx] = s5;t[5+idx] = s6;
310:   }
311:   /* backward solve the upper triangular */
312:   for (i=n-1; i>=0; i--) {
313:     v   = aa + 36*diag[i] + 36;
314:     vi  = aj + diag[i] + 1;
315:     nz  = ai[i+1] - diag[i] - 1;
316:     idt = 6*i;
317:     s1  = t[idt];  s2 = t[1+idt];
318:     s3  = t[2+idt];s4 = t[3+idt];
319:     s5  = t[4+idt];s6 = t[5+idt];
320:     while (nz--) {
321:       idx = 6*(*vi++);
322:       x1  = t[idx];   x2 = t[1+idx];
323:       x3  = t[2+idx]; x4 = t[3+idx];
324:       x5  = t[4+idx]; x6 = t[5+idx];
325:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
326:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
327:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
328:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
329:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
330:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
331:       v  += 36;
332:     }
333:     idc    = 6*(*c--);
334:     v      = aa + 36*diag[i];
335:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
336:                         v[18]*s4+v[24]*s5+v[30]*s6;
337:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
338:                           v[19]*s4+v[25]*s5+v[31]*s6;
339:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
340:                           v[20]*s4+v[26]*s5+v[32]*s6;
341:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
342:                           v[21]*s4+v[27]*s5+v[33]*s6;
343:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
344:                           v[22]*s4+v[28]*s5+v[34]*s6;
345:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
346:                           v[23]*s4+v[29]*s5+v[35]*s6;
347:   }

349:   ISRestoreIndices(isrow,&rout);
350:   ISRestoreIndices(iscol,&cout);
351:   VecRestoreArrayRead(bb,&b);
352:   VecRestoreArray(xx,&x);
353:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
354:   return(0);
355: }

357: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
358: {
359:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
360:   IS                iscol=a->col,isrow=a->row;
361:   PetscErrorCode    ierr;
362:   const PetscInt    *r,*c,*rout,*cout;
363:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
364:   PetscInt          i,nz,idx,idt,idc,m;
365:   const MatScalar   *aa=a->a,*v;
366:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
367:   const PetscScalar *b;

370:   VecGetArrayRead(bb,&b);
371:   VecGetArray(xx,&x);
372:   t    = a->solve_work;

374:   ISGetIndices(isrow,&rout); r = rout;
375:   ISGetIndices(iscol,&cout); c = cout;

377:   /* forward solve the lower triangular */
378:   idx  = 6*r[0];
379:   t[0] = b[idx];   t[1] = b[1+idx];
380:   t[2] = b[2+idx]; t[3] = b[3+idx];
381:   t[4] = b[4+idx]; t[5] = b[5+idx];
382:   for (i=1; i<n; i++) {
383:     v   = aa + 36*ai[i];
384:     vi  = aj + ai[i];
385:     nz  = ai[i+1] - ai[i];
386:     idx = 6*r[i];
387:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
388:     s5  = b[4+idx]; s6 = b[5+idx];
389:     for (m=0; m<nz; m++) {
390:       idx = 6*vi[m];
391:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
392:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
393:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
394:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
395:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
396:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
397:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
398:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
399:       v  += 36;
400:     }
401:     idx      = 6*i;
402:     t[idx]   = s1;t[1+idx] = s2;
403:     t[2+idx] = s3;t[3+idx] = s4;
404:     t[4+idx] = s5;t[5+idx] = s6;
405:   }
406:   /* backward solve the upper triangular */
407:   for (i=n-1; i>=0; i--) {
408:     v   = aa + 36*(adiag[i+1]+1);
409:     vi  = aj + adiag[i+1]+1;
410:     nz  = adiag[i] - adiag[i+1] - 1;
411:     idt = 6*i;
412:     s1  = t[idt];  s2 = t[1+idt];
413:     s3  = t[2+idt];s4 = t[3+idt];
414:     s5  = t[4+idt];s6 = t[5+idt];
415:     for (m=0; m<nz; m++) {
416:       idx = 6*vi[m];
417:       x1  = t[idx];   x2 = t[1+idx];
418:       x3  = t[2+idx]; x4 = t[3+idx];
419:       x5  = t[4+idx]; x6 = t[5+idx];
420:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
421:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
422:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
423:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
424:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
425:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
426:       v  += 36;
427:     }
428:     idc    = 6*c[i];
429:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
430:                         v[18]*s4+v[24]*s5+v[30]*s6;
431:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
432:                           v[19]*s4+v[25]*s5+v[31]*s6;
433:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
434:                           v[20]*s4+v[26]*s5+v[32]*s6;
435:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
436:                           v[21]*s4+v[27]*s5+v[33]*s6;
437:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
438:                           v[22]*s4+v[28]*s5+v[34]*s6;
439:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
440:                           v[23]*s4+v[29]*s5+v[35]*s6;
441:   }

443:   ISRestoreIndices(isrow,&rout);
444:   ISRestoreIndices(iscol,&cout);
445:   VecRestoreArrayRead(bb,&b);
446:   VecRestoreArray(xx,&x);
447:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
448:   return(0);
449: }

451: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
452: {
453:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
454:   IS                iscol=a->col,isrow=a->row;
455:   PetscErrorCode    ierr;
456:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
457:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
458:   PetscInt          i,nz,idx,idt,idc;
459:   const MatScalar   *aa=a->a,*v;
460:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
461:   const PetscScalar *b;

464:   VecGetArrayRead(bb,&b);
465:   VecGetArray(xx,&x);
466:   t    = a->solve_work;

468:   ISGetIndices(isrow,&rout); r = rout;
469:   ISGetIndices(iscol,&cout); c = cout + (n-1);

471:   /* forward solve the lower triangular */
472:   idx  = 5*(*r++);
473:   t[0] = b[idx];   t[1] = b[1+idx];
474:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
475:   for (i=1; i<n; i++) {
476:     v   = aa + 25*ai[i];
477:     vi  = aj + ai[i];
478:     nz  = diag[i] - ai[i];
479:     idx = 5*(*r++);
480:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
481:     s5  = b[4+idx];
482:     while (nz--) {
483:       idx = 5*(*vi++);
484:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
485:       x4  = t[3+idx];x5 = t[4+idx];
486:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
487:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
488:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
489:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
490:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
491:       v  += 25;
492:     }
493:     idx      = 5*i;
494:     t[idx]   = s1;t[1+idx] = s2;
495:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
496:   }
497:   /* backward solve the upper triangular */
498:   for (i=n-1; i>=0; i--) {
499:     v   = aa + 25*diag[i] + 25;
500:     vi  = aj + diag[i] + 1;
501:     nz  = ai[i+1] - diag[i] - 1;
502:     idt = 5*i;
503:     s1  = t[idt];  s2 = t[1+idt];
504:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
505:     while (nz--) {
506:       idx = 5*(*vi++);
507:       x1  = t[idx];   x2 = t[1+idx];
508:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
509:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
510:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
511:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
512:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
513:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
514:       v  += 25;
515:     }
516:     idc    = 5*(*c--);
517:     v      = aa + 25*diag[i];
518:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
519:                         v[15]*s4+v[20]*s5;
520:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
521:                           v[16]*s4+v[21]*s5;
522:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
523:                           v[17]*s4+v[22]*s5;
524:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
525:                           v[18]*s4+v[23]*s5;
526:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
527:                           v[19]*s4+v[24]*s5;
528:   }

530:   ISRestoreIndices(isrow,&rout);
531:   ISRestoreIndices(iscol,&cout);
532:   VecRestoreArrayRead(bb,&b);
533:   VecRestoreArray(xx,&x);
534:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
535:   return(0);
536: }

538: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
539: {
540:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
541:   IS                iscol=a->col,isrow=a->row;
542:   PetscErrorCode    ierr;
543:   const PetscInt    *r,*c,*rout,*cout;
544:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
545:   PetscInt          i,nz,idx,idt,idc,m;
546:   const MatScalar   *aa=a->a,*v;
547:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
548:   const PetscScalar *b;

551:   VecGetArrayRead(bb,&b);
552:   VecGetArray(xx,&x);
553:   t    = a->solve_work;

555:   ISGetIndices(isrow,&rout); r = rout;
556:   ISGetIndices(iscol,&cout); c = cout;

558:   /* forward solve the lower triangular */
559:   idx  = 5*r[0];
560:   t[0] = b[idx];   t[1] = b[1+idx];
561:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
562:   for (i=1; i<n; i++) {
563:     v   = aa + 25*ai[i];
564:     vi  = aj + ai[i];
565:     nz  = ai[i+1] - ai[i];
566:     idx = 5*r[i];
567:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
568:     s5  = b[4+idx];
569:     for (m=0; m<nz; m++) {
570:       idx = 5*vi[m];
571:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
572:       x4  = t[3+idx];x5 = t[4+idx];
573:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
574:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
575:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
576:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
577:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
578:       v  += 25;
579:     }
580:     idx      = 5*i;
581:     t[idx]   = s1;t[1+idx] = s2;
582:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
583:   }
584:   /* backward solve the upper triangular */
585:   for (i=n-1; i>=0; i--) {
586:     v   = aa + 25*(adiag[i+1]+1);
587:     vi  = aj + adiag[i+1]+1;
588:     nz  = adiag[i] - adiag[i+1] - 1;
589:     idt = 5*i;
590:     s1  = t[idt];  s2 = t[1+idt];
591:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
592:     for (m=0; m<nz; m++) {
593:       idx = 5*vi[m];
594:       x1  = t[idx];   x2 = t[1+idx];
595:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
596:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
597:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
598:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
599:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
600:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
601:       v  += 25;
602:     }
603:     idc    = 5*c[i];
604:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
605:                         v[15]*s4+v[20]*s5;
606:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
607:                           v[16]*s4+v[21]*s5;
608:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
609:                           v[17]*s4+v[22]*s5;
610:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
611:                           v[18]*s4+v[23]*s5;
612:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
613:                           v[19]*s4+v[24]*s5;
614:   }

616:   ISRestoreIndices(isrow,&rout);
617:   ISRestoreIndices(iscol,&cout);
618:   VecRestoreArrayRead(bb,&b);
619:   VecRestoreArray(xx,&x);
620:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
621:   return(0);
622: }

624: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
625: {
626:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
627:   IS                iscol=a->col,isrow=a->row;
628:   PetscErrorCode    ierr;
629:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
630:   PetscInt          i,nz,idx,idt,idc;
631:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
632:   const MatScalar   *aa=a->a,*v;
633:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
634:   const PetscScalar *b;

637:   VecGetArrayRead(bb,&b);
638:   VecGetArray(xx,&x);
639:   t    = a->solve_work;

641:   ISGetIndices(isrow,&rout); r = rout;
642:   ISGetIndices(iscol,&cout); c = cout + (n-1);

644:   /* forward solve the lower triangular */
645:   idx  = 4*(*r++);
646:   t[0] = b[idx];   t[1] = b[1+idx];
647:   t[2] = b[2+idx]; t[3] = b[3+idx];
648:   for (i=1; i<n; i++) {
649:     v   = aa + 16*ai[i];
650:     vi  = aj + ai[i];
651:     nz  = diag[i] - ai[i];
652:     idx = 4*(*r++);
653:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
654:     while (nz--) {
655:       idx = 4*(*vi++);
656:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
657:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
658:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
659:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
660:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
661:       v  += 16;
662:     }
663:     idx      = 4*i;
664:     t[idx]   = s1;t[1+idx] = s2;
665:     t[2+idx] = s3;t[3+idx] = s4;
666:   }
667:   /* backward solve the upper triangular */
668:   for (i=n-1; i>=0; i--) {
669:     v   = aa + 16*diag[i] + 16;
670:     vi  = aj + diag[i] + 1;
671:     nz  = ai[i+1] - diag[i] - 1;
672:     idt = 4*i;
673:     s1  = t[idt];  s2 = t[1+idt];
674:     s3  = t[2+idt];s4 = t[3+idt];
675:     while (nz--) {
676:       idx = 4*(*vi++);
677:       x1  = t[idx];   x2 = t[1+idx];
678:       x3  = t[2+idx]; x4 = t[3+idx];
679:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
680:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
681:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
682:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
683:       v  += 16;
684:     }
685:     idc      = 4*(*c--);
686:     v        = aa + 16*diag[i];
687:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
688:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
689:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
690:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
691:   }

693:   ISRestoreIndices(isrow,&rout);
694:   ISRestoreIndices(iscol,&cout);
695:   VecRestoreArrayRead(bb,&b);
696:   VecRestoreArray(xx,&x);
697:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
698:   return(0);
699: }

701: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
702: {
703:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
704:   IS                iscol=a->col,isrow=a->row;
705:   PetscErrorCode    ierr;
706:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
707:   PetscInt          i,nz,idx,idt,idc,m;
708:   const PetscInt    *r,*c,*rout,*cout;
709:   const MatScalar   *aa=a->a,*v;
710:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
711:   const PetscScalar *b;

714:   VecGetArrayRead(bb,&b);
715:   VecGetArray(xx,&x);
716:   t    = a->solve_work;

718:   ISGetIndices(isrow,&rout); r = rout;
719:   ISGetIndices(iscol,&cout); c = cout;

721:   /* forward solve the lower triangular */
722:   idx  = 4*r[0];
723:   t[0] = b[idx];   t[1] = b[1+idx];
724:   t[2] = b[2+idx]; t[3] = b[3+idx];
725:   for (i=1; i<n; i++) {
726:     v   = aa + 16*ai[i];
727:     vi  = aj + ai[i];
728:     nz  = ai[i+1] - ai[i];
729:     idx = 4*r[i];
730:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
731:     for (m=0; m<nz; m++) {
732:       idx = 4*vi[m];
733:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
734:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
735:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
736:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
737:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
738:       v  += 16;
739:     }
740:     idx      = 4*i;
741:     t[idx]   = s1;t[1+idx] = s2;
742:     t[2+idx] = s3;t[3+idx] = s4;
743:   }
744:   /* backward solve the upper triangular */
745:   for (i=n-1; i>=0; i--) {
746:     v   = aa + 16*(adiag[i+1]+1);
747:     vi  = aj + adiag[i+1]+1;
748:     nz  = adiag[i] - adiag[i+1] - 1;
749:     idt = 4*i;
750:     s1  = t[idt];  s2 = t[1+idt];
751:     s3  = t[2+idt];s4 = t[3+idt];
752:     for (m=0; m<nz; m++) {
753:       idx = 4*vi[m];
754:       x1  = t[idx];   x2 = t[1+idx];
755:       x3  = t[2+idx]; x4 = t[3+idx];
756:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
757:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
758:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
759:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
760:       v  += 16;
761:     }
762:     idc      = 4*c[i];
763:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
764:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
765:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
766:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
767:   }

769:   ISRestoreIndices(isrow,&rout);
770:   ISRestoreIndices(iscol,&cout);
771:   VecRestoreArrayRead(bb,&b);
772:   VecRestoreArray(xx,&x);
773:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
774:   return(0);
775: }

777: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
778: {
779:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
780:   IS                iscol=a->col,isrow=a->row;
781:   PetscErrorCode    ierr;
782:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
783:   PetscInt          i,nz,idx,idt,idc;
784:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
785:   const MatScalar   *aa=a->a,*v;
786:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
787:   PetscScalar       *x;
788:   const PetscScalar *b;

791:   VecGetArrayRead(bb,&b);
792:   VecGetArray(xx,&x);
793:   t    = (MatScalar*)a->solve_work;

795:   ISGetIndices(isrow,&rout); r = rout;
796:   ISGetIndices(iscol,&cout); c = cout + (n-1);

798:   /* forward solve the lower triangular */
799:   idx  = 4*(*r++);
800:   t[0] = (MatScalar)b[idx];
801:   t[1] = (MatScalar)b[1+idx];
802:   t[2] = (MatScalar)b[2+idx];
803:   t[3] = (MatScalar)b[3+idx];
804:   for (i=1; i<n; i++) {
805:     v   = aa + 16*ai[i];
806:     vi  = aj + ai[i];
807:     nz  = diag[i] - ai[i];
808:     idx = 4*(*r++);
809:     s1  = (MatScalar)b[idx];
810:     s2  = (MatScalar)b[1+idx];
811:     s3  = (MatScalar)b[2+idx];
812:     s4  = (MatScalar)b[3+idx];
813:     while (nz--) {
814:       idx = 4*(*vi++);
815:       x1  = t[idx];
816:       x2  = t[1+idx];
817:       x3  = t[2+idx];
818:       x4  = t[3+idx];
819:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
820:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
821:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
822:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
823:       v  += 16;
824:     }
825:     idx      = 4*i;
826:     t[idx]   = s1;
827:     t[1+idx] = s2;
828:     t[2+idx] = s3;
829:     t[3+idx] = s4;
830:   }
831:   /* backward solve the upper triangular */
832:   for (i=n-1; i>=0; i--) {
833:     v   = aa + 16*diag[i] + 16;
834:     vi  = aj + diag[i] + 1;
835:     nz  = ai[i+1] - diag[i] - 1;
836:     idt = 4*i;
837:     s1  = t[idt];
838:     s2  = t[1+idt];
839:     s3  = t[2+idt];
840:     s4  = t[3+idt];
841:     while (nz--) {
842:       idx = 4*(*vi++);
843:       x1  = t[idx];
844:       x2  = t[1+idx];
845:       x3  = t[2+idx];
846:       x4  = t[3+idx];
847:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
848:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
849:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
850:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
851:       v  += 16;
852:     }
853:     idc      = 4*(*c--);
854:     v        = aa + 16*diag[i];
855:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
856:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
857:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
858:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
859:     x[idc]   = (PetscScalar)t[idt];
860:     x[1+idc] = (PetscScalar)t[1+idt];
861:     x[2+idc] = (PetscScalar)t[2+idt];
862:     x[3+idc] = (PetscScalar)t[3+idt];
863:   }

865:   ISRestoreIndices(isrow,&rout);
866:   ISRestoreIndices(iscol,&cout);
867:   VecRestoreArrayRead(bb,&b);
868:   VecRestoreArray(xx,&x);
869:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
870:   return(0);
871: }

873: #if defined(PETSC_HAVE_SSE)

875: #include PETSC_HAVE_SSE

877: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
878: {
879:   /*
880:      Note: This code uses demotion of double
881:      to float when performing the mixed-mode computation.
882:      This may not be numerically reasonable for all applications.
883:   */
884:   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
885:   IS             iscol=a->col,isrow=a->row;
887:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
888:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
889:   MatScalar      *aa=a->a,*v;
890:   PetscScalar    *x,*b,*t;

892:   /* Make space in temp stack for 16 Byte Aligned arrays */
893:   float         ssealignedspace[11],*tmps,*tmpx;
894:   unsigned long offset;

897:   SSE_SCOPE_BEGIN;

899:   offset = (unsigned long)ssealignedspace % 16;
900:   if (offset) offset = (16 - offset)/4;
901:   tmps = &ssealignedspace[offset];
902:   tmpx = &ssealignedspace[offset+4];
903:   PREFETCH_NTA(aa+16*ai[1]);

905:   VecGetArray(bb,&b);
906:   VecGetArray(xx,&x);
907:   t    = a->solve_work;

909:   ISGetIndices(isrow,&rout); r = rout;
910:   ISGetIndices(iscol,&cout); c = cout + (n-1);

912:   /* forward solve the lower triangular */
913:   idx  = 4*(*r++);
914:   t[0] = b[idx];   t[1] = b[1+idx];
915:   t[2] = b[2+idx]; t[3] = b[3+idx];
916:   v    =  aa + 16*ai[1];

918:   for (i=1; i<n; ) {
919:     PREFETCH_NTA(&v[8]);
920:     vi  =  aj      + ai[i];
921:     nz  =  diag[i] - ai[i];
922:     idx =  4*(*r++);

924:     /* Demote sum from double to float */
925:     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
926:     LOAD_PS(tmps,XMM7);

928:     while (nz--) {
929:       PREFETCH_NTA(&v[16]);
930:       idx = 4*(*vi++);

932:       /* Demote solution (so far) from double to float */
933:       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

935:       /* 4x4 Matrix-Vector product with negative accumulation: */
936:       SSE_INLINE_BEGIN_2(tmpx,v)
937:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

939:       /* First Column */
940:       SSE_COPY_PS(XMM0,XMM6)
941:       SSE_SHUFFLE(XMM0,XMM0,0x00)
942:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
943:       SSE_SUB_PS(XMM7,XMM0)

945:       /* Second Column */
946:       SSE_COPY_PS(XMM1,XMM6)
947:       SSE_SHUFFLE(XMM1,XMM1,0x55)
948:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
949:       SSE_SUB_PS(XMM7,XMM1)

951:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

953:       /* Third Column */
954:       SSE_COPY_PS(XMM2,XMM6)
955:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
956:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
957:       SSE_SUB_PS(XMM7,XMM2)

959:       /* Fourth Column */
960:       SSE_COPY_PS(XMM3,XMM6)
961:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
962:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
963:       SSE_SUB_PS(XMM7,XMM3)
964:       SSE_INLINE_END_2

966:       v += 16;
967:     }
968:     idx = 4*i;
969:     v   = aa + 16*ai[++i];
970:     PREFETCH_NTA(v);
971:     STORE_PS(tmps,XMM7);

973:     /* Promote result from float to double */
974:     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
975:   }
976:   /* backward solve the upper triangular */
977:   idt  = 4*(n-1);
978:   ai16 = 16*diag[n-1];
979:   v    = aa + ai16 + 16;
980:   for (i=n-1; i>=0; ) {
981:     PREFETCH_NTA(&v[8]);
982:     vi = aj + diag[i] + 1;
983:     nz = ai[i+1] - diag[i] - 1;

985:     /* Demote accumulator from double to float */
986:     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
987:     LOAD_PS(tmps,XMM7);

989:     while (nz--) {
990:       PREFETCH_NTA(&v[16]);
991:       idx = 4*(*vi++);

993:       /* Demote solution (so far) from double to float */
994:       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

996:       /* 4x4 Matrix-Vector Product with negative accumulation: */
997:       SSE_INLINE_BEGIN_2(tmpx,v)
998:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1000:       /* First Column */
1001:       SSE_COPY_PS(XMM0,XMM6)
1002:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1003:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1004:       SSE_SUB_PS(XMM7,XMM0)

1006:       /* Second Column */
1007:       SSE_COPY_PS(XMM1,XMM6)
1008:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1009:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1010:       SSE_SUB_PS(XMM7,XMM1)

1012:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1014:       /* Third Column */
1015:       SSE_COPY_PS(XMM2,XMM6)
1016:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1017:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1018:       SSE_SUB_PS(XMM7,XMM2)

1020:       /* Fourth Column */
1021:       SSE_COPY_PS(XMM3,XMM6)
1022:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1023:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1024:       SSE_SUB_PS(XMM7,XMM3)
1025:       SSE_INLINE_END_2
1026:       v += 16;
1027:     }
1028:     v    = aa + ai16;
1029:     ai16 = 16*diag[--i];
1030:     PREFETCH_NTA(aa+ai16+16);
1031:     /*
1032:        Scale the result by the diagonal 4x4 block,
1033:        which was inverted as part of the factorization
1034:     */
1035:     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1036:     /* First Column */
1037:     SSE_COPY_PS(XMM0,XMM7)
1038:     SSE_SHUFFLE(XMM0,XMM0,0x00)
1039:     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1041:     /* Second Column */
1042:     SSE_COPY_PS(XMM1,XMM7)
1043:     SSE_SHUFFLE(XMM1,XMM1,0x55)
1044:     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1045:     SSE_ADD_PS(XMM0,XMM1)

1047:     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1049:     /* Third Column */
1050:     SSE_COPY_PS(XMM2,XMM7)
1051:     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1052:     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1053:     SSE_ADD_PS(XMM0,XMM2)

1055:     /* Fourth Column */
1056:     SSE_COPY_PS(XMM3,XMM7)
1057:     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1058:     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1059:     SSE_ADD_PS(XMM0,XMM3)

1061:     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1062:     SSE_INLINE_END_3

1064:     /* Promote solution from float to double */
1065:     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

1067:     /* Apply reordering to t and stream into x.    */
1068:     /* This way, x doesn't pollute the cache.      */
1069:     /* Be careful with size: 2 doubles = 4 floats! */
1070:     idc = 4*(*c--);
1071:     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1072:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1073:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1074:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1075:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1076:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1077:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1078:     SSE_INLINE_END_2
1079:     v    = aa + ai16 + 16;
1080:     idt -= 4;
1081:   }

1083:   ISRestoreIndices(isrow,&rout);
1084:   ISRestoreIndices(iscol,&cout);
1085:   VecRestoreArray(bb,&b);
1086:   VecRestoreArray(xx,&x);
1087:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1088:   SSE_SCOPE_END;
1089:   return(0);
1090: }

1092: #endif

1094: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1095: {
1096:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1097:   IS                iscol=a->col,isrow=a->row;
1098:   PetscErrorCode    ierr;
1099:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1100:   PetscInt          i,nz,idx,idt,idc;
1101:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1102:   const MatScalar   *aa=a->a,*v;
1103:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1104:   const PetscScalar *b;

1107:   VecGetArrayRead(bb,&b);
1108:   VecGetArray(xx,&x);
1109:   t    = a->solve_work;

1111:   ISGetIndices(isrow,&rout); r = rout;
1112:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1114:   /* forward solve the lower triangular */
1115:   idx  = 3*(*r++);
1116:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1117:   for (i=1; i<n; i++) {
1118:     v   = aa + 9*ai[i];
1119:     vi  = aj + ai[i];
1120:     nz  = diag[i] - ai[i];
1121:     idx = 3*(*r++);
1122:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1123:     while (nz--) {
1124:       idx = 3*(*vi++);
1125:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1126:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1127:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1128:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1129:       v  += 9;
1130:     }
1131:     idx    = 3*i;
1132:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1133:   }
1134:   /* backward solve the upper triangular */
1135:   for (i=n-1; i>=0; i--) {
1136:     v   = aa + 9*diag[i] + 9;
1137:     vi  = aj + diag[i] + 1;
1138:     nz  = ai[i+1] - diag[i] - 1;
1139:     idt = 3*i;
1140:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1141:     while (nz--) {
1142:       idx = 3*(*vi++);
1143:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1144:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1145:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1146:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1147:       v  += 9;
1148:     }
1149:     idc      = 3*(*c--);
1150:     v        = aa + 9*diag[i];
1151:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1152:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1153:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1154:   }
1155:   ISRestoreIndices(isrow,&rout);
1156:   ISRestoreIndices(iscol,&cout);
1157:   VecRestoreArrayRead(bb,&b);
1158:   VecRestoreArray(xx,&x);
1159:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1160:   return(0);
1161: }

1163: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1164: {
1165:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1166:   IS                iscol=a->col,isrow=a->row;
1167:   PetscErrorCode    ierr;
1168:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1169:   PetscInt          i,nz,idx,idt,idc,m;
1170:   const PetscInt    *r,*c,*rout,*cout;
1171:   const MatScalar   *aa=a->a,*v;
1172:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1173:   const PetscScalar *b;

1176:   VecGetArrayRead(bb,&b);
1177:   VecGetArray(xx,&x);
1178:   t    = a->solve_work;

1180:   ISGetIndices(isrow,&rout); r = rout;
1181:   ISGetIndices(iscol,&cout); c = cout;

1183:   /* forward solve the lower triangular */
1184:   idx  = 3*r[0];
1185:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1186:   for (i=1; i<n; i++) {
1187:     v   = aa + 9*ai[i];
1188:     vi  = aj + ai[i];
1189:     nz  = ai[i+1] - ai[i];
1190:     idx = 3*r[i];
1191:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1192:     for (m=0; m<nz; m++) {
1193:       idx = 3*vi[m];
1194:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1195:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1196:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1197:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1198:       v  += 9;
1199:     }
1200:     idx    = 3*i;
1201:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1202:   }
1203:   /* backward solve the upper triangular */
1204:   for (i=n-1; i>=0; i--) {
1205:     v   = aa + 9*(adiag[i+1]+1);
1206:     vi  = aj + adiag[i+1]+1;
1207:     nz  = adiag[i] - adiag[i+1] - 1;
1208:     idt = 3*i;
1209:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1210:     for (m=0; m<nz; m++) {
1211:       idx = 3*vi[m];
1212:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1213:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1214:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1215:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1216:       v  += 9;
1217:     }
1218:     idc      = 3*c[i];
1219:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1220:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1221:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1222:   }
1223:   ISRestoreIndices(isrow,&rout);
1224:   ISRestoreIndices(iscol,&cout);
1225:   VecRestoreArrayRead(bb,&b);
1226:   VecRestoreArray(xx,&x);
1227:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1228:   return(0);
1229: }

1231: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1232: {
1233:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1234:   IS                iscol=a->col,isrow=a->row;
1235:   PetscErrorCode    ierr;
1236:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1237:   PetscInt          i,nz,idx,idt,idc;
1238:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1239:   const MatScalar   *aa=a->a,*v;
1240:   PetscScalar       *x,s1,s2,x1,x2,*t;
1241:   const PetscScalar *b;

1244:   VecGetArrayRead(bb,&b);
1245:   VecGetArray(xx,&x);
1246:   t    = a->solve_work;

1248:   ISGetIndices(isrow,&rout); r = rout;
1249:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1251:   /* forward solve the lower triangular */
1252:   idx  = 2*(*r++);
1253:   t[0] = b[idx]; t[1] = b[1+idx];
1254:   for (i=1; i<n; i++) {
1255:     v   = aa + 4*ai[i];
1256:     vi  = aj + ai[i];
1257:     nz  = diag[i] - ai[i];
1258:     idx = 2*(*r++);
1259:     s1  = b[idx]; s2 = b[1+idx];
1260:     while (nz--) {
1261:       idx = 2*(*vi++);
1262:       x1  = t[idx]; x2 = t[1+idx];
1263:       s1 -= v[0]*x1 + v[2]*x2;
1264:       s2 -= v[1]*x1 + v[3]*x2;
1265:       v  += 4;
1266:     }
1267:     idx    = 2*i;
1268:     t[idx] = s1; t[1+idx] = s2;
1269:   }
1270:   /* backward solve the upper triangular */
1271:   for (i=n-1; i>=0; i--) {
1272:     v   = aa + 4*diag[i] + 4;
1273:     vi  = aj + diag[i] + 1;
1274:     nz  = ai[i+1] - diag[i] - 1;
1275:     idt = 2*i;
1276:     s1  = t[idt]; s2 = t[1+idt];
1277:     while (nz--) {
1278:       idx = 2*(*vi++);
1279:       x1  = t[idx]; x2 = t[1+idx];
1280:       s1 -= v[0]*x1 + v[2]*x2;
1281:       s2 -= v[1]*x1 + v[3]*x2;
1282:       v  += 4;
1283:     }
1284:     idc      = 2*(*c--);
1285:     v        = aa + 4*diag[i];
1286:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1287:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1288:   }
1289:   ISRestoreIndices(isrow,&rout);
1290:   ISRestoreIndices(iscol,&cout);
1291:   VecRestoreArrayRead(bb,&b);
1292:   VecRestoreArray(xx,&x);
1293:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1294:   return(0);
1295: }

1297: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1298: {
1299:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1300:   IS                iscol=a->col,isrow=a->row;
1301:   PetscErrorCode    ierr;
1302:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1303:   PetscInt          i,nz,idx,jdx,idt,idc,m;
1304:   const PetscInt    *r,*c,*rout,*cout;
1305:   const MatScalar   *aa=a->a,*v;
1306:   PetscScalar       *x,s1,s2,x1,x2,*t;
1307:   const PetscScalar *b;

1310:   VecGetArrayRead(bb,&b);
1311:   VecGetArray(xx,&x);
1312:   t    = a->solve_work;

1314:   ISGetIndices(isrow,&rout); r = rout;
1315:   ISGetIndices(iscol,&cout); c = cout;

1317:   /* forward solve the lower triangular */
1318:   idx  = 2*r[0];
1319:   t[0] = b[idx]; t[1] = b[1+idx];
1320:   for (i=1; i<n; i++) {
1321:     v   = aa + 4*ai[i];
1322:     vi  = aj + ai[i];
1323:     nz  = ai[i+1] - ai[i];
1324:     idx = 2*r[i];
1325:     s1  = b[idx]; s2 = b[1+idx];
1326:     for (m=0; m<nz; m++) {
1327:       jdx = 2*vi[m];
1328:       x1  = t[jdx]; x2 = t[1+jdx];
1329:       s1 -= v[0]*x1 + v[2]*x2;
1330:       s2 -= v[1]*x1 + v[3]*x2;
1331:       v  += 4;
1332:     }
1333:     idx    = 2*i;
1334:     t[idx] = s1; t[1+idx] = s2;
1335:   }
1336:   /* backward solve the upper triangular */
1337:   for (i=n-1; i>=0; i--) {
1338:     v   = aa + 4*(adiag[i+1]+1);
1339:     vi  = aj + adiag[i+1]+1;
1340:     nz  = adiag[i] - adiag[i+1] - 1;
1341:     idt = 2*i;
1342:     s1  = t[idt]; s2 = t[1+idt];
1343:     for (m=0; m<nz; m++) {
1344:       idx = 2*vi[m];
1345:       x1  = t[idx]; x2 = t[1+idx];
1346:       s1 -= v[0]*x1 + v[2]*x2;
1347:       s2 -= v[1]*x1 + v[3]*x2;
1348:       v  += 4;
1349:     }
1350:     idc      = 2*c[i];
1351:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1352:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1353:   }
1354:   ISRestoreIndices(isrow,&rout);
1355:   ISRestoreIndices(iscol,&cout);
1356:   VecRestoreArrayRead(bb,&b);
1357:   VecRestoreArray(xx,&x);
1358:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1359:   return(0);
1360: }

1362: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1363: {
1364:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1365:   IS                iscol=a->col,isrow=a->row;
1366:   PetscErrorCode    ierr;
1367:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1368:   PetscInt          i,nz;
1369:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1370:   const MatScalar   *aa=a->a,*v;
1371:   PetscScalar       *x,s1,*t;
1372:   const PetscScalar *b;

1375:   if (!n) return(0);

1377:   VecGetArrayRead(bb,&b);
1378:   VecGetArray(xx,&x);
1379:   t    = a->solve_work;

1381:   ISGetIndices(isrow,&rout); r = rout;
1382:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1384:   /* forward solve the lower triangular */
1385:   t[0] = b[*r++];
1386:   for (i=1; i<n; i++) {
1387:     v  = aa + ai[i];
1388:     vi = aj + ai[i];
1389:     nz = diag[i] - ai[i];
1390:     s1 = b[*r++];
1391:     while (nz--) {
1392:       s1 -= (*v++)*t[*vi++];
1393:     }
1394:     t[i] = s1;
1395:   }
1396:   /* backward solve the upper triangular */
1397:   for (i=n-1; i>=0; i--) {
1398:     v  = aa + diag[i] + 1;
1399:     vi = aj + diag[i] + 1;
1400:     nz = ai[i+1] - diag[i] - 1;
1401:     s1 = t[i];
1402:     while (nz--) {
1403:       s1 -= (*v++)*t[*vi++];
1404:     }
1405:     x[*c--] = t[i] = aa[diag[i]]*s1;
1406:   }

1408:   ISRestoreIndices(isrow,&rout);
1409:   ISRestoreIndices(iscol,&cout);
1410:   VecRestoreArrayRead(bb,&b);
1411:   VecRestoreArray(xx,&x);
1412:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1413:   return(0);
1414: }

1416: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1417: {
1418:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1419:   IS                iscol = a->col,isrow = a->row;
1420:   PetscErrorCode    ierr;
1421:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1422:   const PetscInt    *rout,*cout,*r,*c;
1423:   PetscScalar       *x,*tmp,sum;
1424:   const PetscScalar *b;
1425:   const MatScalar   *aa = a->a,*v;

1428:   if (!n) return(0);

1430:   VecGetArrayRead(bb,&b);
1431:   VecGetArray(xx,&x);
1432:   tmp  = a->solve_work;

1434:   ISGetIndices(isrow,&rout); r = rout;
1435:   ISGetIndices(iscol,&cout); c = cout;

1437:   /* forward solve the lower triangular */
1438:   tmp[0] = b[r[0]];
1439:   v      = aa;
1440:   vi     = aj;
1441:   for (i=1; i<n; i++) {
1442:     nz  = ai[i+1] - ai[i];
1443:     sum = b[r[i]];
1444:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1445:     tmp[i] = sum;
1446:     v     += nz; vi += nz;
1447:   }

1449:   /* backward solve the upper triangular */
1450:   for (i=n-1; i>=0; i--) {
1451:     v   = aa + adiag[i+1]+1;
1452:     vi  = aj + adiag[i+1]+1;
1453:     nz  = adiag[i]-adiag[i+1]-1;
1454:     sum = tmp[i];
1455:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1456:     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1457:   }

1459:   ISRestoreIndices(isrow,&rout);
1460:   ISRestoreIndices(iscol,&cout);
1461:   VecRestoreArrayRead(bb,&b);
1462:   VecRestoreArray(xx,&x);
1463:   PetscLogFlops(2*a->nz - A->cmap->n);
1464:   return(0);
1465: }