Actual source code: sbstrmfact.c

petsc-3.4.5 2014-06-29
  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 4:  #include ../src/mat/impls/sbaij/seq/sbstream/sbstream.h

  6: extern PetscErrorCode MatDestroy_SeqSBSTRM(Mat A);


 11: PetscErrorCode MatSolve_SeqSBSTRM_4_inplace(Mat A,Vec bb,Vec xx)
 12: {
 13:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
 14:   IS             isrow=a->row;
 15:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
 17:   const PetscInt *r;
 18:   PetscInt       nz,*vj,k,idx;
 19:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

 21:   Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
 22:   MatScalar       *as     =sbstrm->as,*diag;
 23:   PetscScalar     tp0, tp1, tp2, tp3;
 24:   const MatScalar *v0, *v1, *v2, *v3;
 25:   PetscInt        slen;

 28:   slen = 4*(ai[mbs]-ai[0]);
 29:   v0   = as + 16*ai[0];
 30:   v1   = v0 + slen;
 31:   v2   = v1 + slen;
 32:   v3   = v2 + slen;

 34:   VecGetArray(bb,&b);
 35:   VecGetArray(xx,&x);
 36:   t    = a->solve_work;
 37:   ISGetIndices(isrow,&r);

 39:   /* solve U^T * D * y = b by forward substitution */
 40:   tp = t;
 41:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 42:     idx   = 4*r[k];
 43:     tp[0] = b[idx];
 44:     tp[1] = b[idx+1];
 45:     tp[2] = b[idx+2];
 46:     tp[3] = b[idx+3];
 47:     tp   += 4;
 48:   }

 50:   for (k=0; k<mbs; k++) {
 51:     vj = aj + ai[k];
 52:     tp = t + k*4;
 53:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
 54:     nz = ai[k+1] - ai[k];

 56:     tp = t + (*vj)*4;
 57:     while (nz--) {
 58:       tp[0]   += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3;
 59:       tp[1]   += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3;
 60:       tp[2]   += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3;
 61:       tp[3]   += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3;
 62:       vj++; tp = t + (*vj)*4;

 64:       v0 += 4; v1 += 4; v2 += 4; v3 += 4;
 65:     }

 67:     /* xk = inv(Dk)*(Dk*xk) */
 68:     diag  = as+k*16;          /* ptr to inv(Dk) */
 69:     tp    = t + k*4;
 70:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
 71:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
 72:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
 73:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
 74:   }

 76:   /* solve U*x = y by back substitution */
 77:   for (k=mbs-1; k>=0; k--) {
 78:     vj = aj + ai[k+1];
 79:     tp = t + k*4;
 80:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
 81:     nz = ai[k+1] - ai[k];

 83:     tp = t + (*vj)*4;
 84:     while (nz--) {
 85:       /* xk += U(k,* */
 86:       v0 -= 4; v1 -= 4; v2 -= 4; v3 -= 4;

 88:       vj--; tp = t + (*vj)*4;

 90:       tp0 = tp[0]; tp1 = tp[1]; tp2 = tp[2]; tp3 = tp[3];
 91:       x0 += v0[3]*tp3 + v0[2]*tp2 + v0[1]*tp1 + v0[0]*tp0;
 92:       x1 += v1[3]*tp3 + v1[2]*tp2 + v1[1]*tp1 + v1[0]*tp0;
 93:       x2 += v2[3]*tp3 + v2[2]*tp2 + v2[1]*tp1 + v2[0]*tp0;
 94:       x3 += v3[3]*tp3 + v3[2]*tp2 + v3[1]*tp1 + v3[0]*tp0;
 95:     }
 96:     tp    = t + k*4;
 97:     tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;

 99:     idx      = 4*r[k];
100:     x[idx]   = x0;
101:     x[idx+1] = x1;
102:     x[idx+2] = x2;
103:     x[idx+3] = x3;
104:   }

106:   ISRestoreIndices(isrow,&r);
107:   VecRestoreArray(bb,&b);
108:   VecRestoreArray(xx,&x);
109:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
110:   return(0);
111: }

115: PetscErrorCode MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
116: {
117:   MatScalar   *diag;
118:   PetscScalar *xp,x0,x1,x2,x3;
119:   PetscInt    nz,*vj,k;

121:   const MatScalar *v0, *v1, *v2, *v3;
122:   PetscInt        slen;

125:   slen = 4*(ai[mbs]-ai[0]);
126:   v0   = aa + 16*ai[0];
127:   v1   = v0 + slen;
128:   v2   = v1 + slen;
129:   v3   = v2 + slen;

131:   for (k=0; k<mbs; k++) {
132:     xp = x + k*4;

134:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */

136:     nz = ai[k+1] - ai[k];
137:     vj = aj + ai[k];
138:     xp = x + (*vj)*4;
139:     while (nz--) {
140:       /*  += U(k,^T*(Dk*xk) */
141:       xp[0]   += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3;
142:       xp[1]   += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3;
143:       xp[2]   += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3;
144:       xp[3]   += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3;
145:       vj++; xp = x + (*vj)*4;

147:       v0 += 4; v1 += 4; v2 += 4; v3 += 4;
148:     }
149:     /* xk = inv(Dk)*(Dk*xk) */
150:     diag  = aa+k*16;         /* ptr to inv(Dk) */
151:     xp    = x + k*4;
152:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
153:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
154:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
155:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
156:   }
157:   return(0);
158: }

162: PetscErrorCode MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
163: {
164:   PetscScalar *xp,x0,x1,x2,x3;
165:   PetscInt    nz,*vj,k;

167:   PetscScalar     xp0, xp1, xp2, xp3;
168:   const MatScalar *v0, *v1, *v2, *v3;
169:   PetscInt        slen;

172:   slen = 4*(ai[mbs]-ai[0]);
173:   v0   = aa + 16*ai[0]+4*(ai[mbs]-ai[0]);
174:   v1   = v0 + slen;
175:   v2   = v1 + slen;
176:   v3   = v2 + slen;

178:   for (k=mbs-1; k>=0; k--) {
179:     xp = x + k*4;
180:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
181:     nz = ai[k+1] - ai[k];
182:     vj = aj + ai[k+1];
183:     xp = x + (*vj)*4;
184:     while (nz--) {
185:       /* xk += U(k,* */
186:       v0 -= 4; v1 -= 4; v2 -= 4; v3 -=4;

188:       vj--; xp = x + (*vj)*4;

190:       xp0 = xp[0]; xp1 = xp[1]; xp2 = xp[2]; xp3 = xp[3];
191:       x0 += v0[3]*xp3 + v0[2]*xp2 + v0[1]*xp1 + v0[0]*xp0;
192:       x1 += v1[3]*xp3 + v1[2]*xp2 + v1[1]*xp1 + v1[0]*xp0;
193:       x2 += v2[3]*xp3 + v2[2]*xp2 + v2[1]*xp1 + v2[0]*xp0;
194:       x3 += v3[3]*xp3 + v3[2]*xp2 + v3[1]*xp1 + v3[0]*xp0;
195:     }
196:     xp = x + k*4;

198:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
199:   }
200:   return(0);
201: }

205: PetscErrorCode MatSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
206: {
207:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
208:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
209:   PetscScalar    *x,*b;

212:   Mat_SeqSBSTRM *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
213:   MatScalar     *as     =sbstrm->as;

216: #if 0
217:   MatSolve_SeqSBSTRM_4_inplace(A, bb, xx);
218: #endif
219:   VecGetArray(bb,&b);
220:   VecGetArray(xx,&x);
221:   /* solve U^T * D * y = b by forward substitution */
222:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
223:   MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
224:   /* solve U*x = y by back substitution */
225:   MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
226:   VecRestoreArray(bb,&b);
227:   VecRestoreArray(xx,&x);
228:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
229:   return(0);
230: }

234: PetscErrorCode MatForwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
235: {
236:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
237:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
238:   PetscScalar    *x,*b;

241:   Mat_SeqSBSTRM *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
242:   MatScalar     *as     =sbstrm->as;

245:   VecGetArray(bb,&b);
246:   VecGetArray(xx,&x);
247:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
248:   MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
249:   VecRestoreArray(bb,&b);
250:   VecRestoreArray(xx,&x);
251:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
252:   return(0);
253: }

257: PetscErrorCode MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
258: {
259:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
260:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
261:   PetscScalar    *x,*b;

264:   Mat_SeqSBSTRM *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
265:   MatScalar     *as     =sbstrm->as;

268:   VecGetArray(bb,&b);
269:   VecGetArray(xx,&x);
270:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
271:   MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
272:   VecRestoreArray(bb,&b);
273:   VecRestoreArray(xx,&x);
274:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
275:   return(0);
276: }

280: PetscErrorCode MatSolve_SeqSBSTRM_5_inplace(Mat A,Vec bb,Vec xx)
281: {
282:   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
283:   IS             isrow=a->row;
284:   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
286:   const PetscInt *r;
287:   PetscInt       nz,*vj,k,idx;
288:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

290:   Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
291:   MatScalar       *as     =sbstrm->as,*diag;
292:   PetscScalar     tp0, tp1, tp2, tp3, tp4;
293:   const MatScalar *v0, *v1, *v2, *v3, *v4;
294:   PetscInt        slen;

297:   VecGetArray(bb,&b);
298:   VecGetArray(xx,&x);
299:   t    = a->solve_work;
300:   ISGetIndices(isrow,&r);


303:   slen = 5*(ai[mbs]-ai[0]);
304:   v0   = as + 25*ai[0];
305:   v1   = v0 + slen;
306:   v2   = v1 + slen;
307:   v3   = v2 + slen;
308:   v4   = v3 + slen;

310:   /* solve U^T * D * y = b by forward substitution */
311:   tp = t;
312:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
313:     idx   = 5*r[k];
314:     tp[0] = b[idx];
315:     tp[1] = b[idx+1];
316:     tp[2] = b[idx+2];
317:     tp[3] = b[idx+3];
318:     tp[4] = b[idx+4];
319:     tp   += 5;
320:   }

322:   for (k=0; k<mbs; k++) {
323:     vj = aj + ai[k];
324:     tp = t + k*5;
325:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
326:     nz = ai[k+1] - ai[k];

328:     tp = t + (*vj)*5;
329:     while (nz--) {
330:       tp[0]   += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
331:       tp[1]   += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
332:       tp[2]   += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
333:       tp[3]   += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
334:       tp[4]   += v0[4]*x0 + v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4;
335:       vj++; tp = t + (*vj)*5;

337:       v0 += 5; v1 += 5; v2 += 5; v3 +=5; v4 += 5;
338:     }

340:     /* xk = inv(Dk)*(Dk*xk) */
341:     diag  = as+k*25;          /* ptr to inv(Dk) */
342:     tp    = t + k*5;
343:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
344:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
345:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
346:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
347:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
348:   }

350:   /* solve U*x = y by back substitution */
351:   for (k=mbs-1; k>=0; k--) {
352:     vj = aj + ai[k+1];
353:     tp = t + k*5;
354:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
355:     nz = ai[k+1] - ai[k];

357:     tp = t + (*vj)*5;
358:     while (nz--) {
359:       /* xk += U(k,* */
360:       v0 -= 5; v1 -= 5; v2 -= 5; v3 -=5; v4 -= 5;

362:       vj--; tp = t + (*vj)*5;
363:       tp0 = tp[0]; tp1 = tp[1]; tp2 = tp[2]; tp3 = tp[3]; tp4 = tp[4];
364:       x0 += v0[4]*tp4 + v0[3]*tp3 + v0[2]*tp2 + v0[1]*tp1 + v0[0]*tp0;
365:       x1 += v1[4]*tp4 + v1[3]*tp3 + v1[2]*tp2 + v1[1]*tp1 + v1[0]*tp0;
366:       x2 += v2[4]*tp4 + v2[3]*tp3 + v2[2]*tp2 + v2[1]*tp1 + v2[0]*tp0;
367:       x3 += v3[4]*tp4 + v3[3]*tp3 + v3[2]*tp2 + v3[1]*tp1 + v3[0]*tp0;
368:       x4 += v4[4]*tp4 + v4[3]*tp3 + v4[2]*tp2 + v4[1]*tp1 + v4[0]*tp0;
369:     }
370:     tp    = t + k*5;
371:     tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
372:     idx   = 5*r[k];

374:     x[idx]   = x0;
375:     x[idx+1] = x1;
376:     x[idx+2] = x2;
377:     x[idx+3] = x3;
378:     x[idx+4] = x4;
379:   }

381:   ISRestoreIndices(isrow,&r);
382:   VecRestoreArray(bb,&b);
383:   VecRestoreArray(xx,&x);
384:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
385:   return(0);
386: }

390: PetscErrorCode MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
391: {
392:   MatScalar   *diag;
393:   PetscScalar *xp,x0,x1,x2,x3,x4;
394:   PetscInt    nz,*vj,k;

396:   const MatScalar *v0, *v1, *v2, *v3, *v4;
397:   PetscInt        slen;

400:   slen = 5*(ai[mbs]-ai[0]);
401:   v0   = aa + 25*ai[0];
402:   v1   = v0 + slen;
403:   v2   = v1 + slen;
404:   v3   = v2 + slen;
405:   v4   = v3 + slen;

407:   for (k=0; k<mbs; k++) {
408:     xp = x + k*5;
409:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* Dk*xk = k-th block of x */
410:     nz = ai[k+1] - ai[k];
411:     vj = aj + ai[k];
412:     xp = x + (*vj)*5;
413:     while (nz--) {
414:       /*  += U(k,^T*(Dk*xk) */
415:       xp[0] += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
416:       xp[1] += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
417:       xp[2] += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
418:       xp[3] += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
419:       xp[4] += v0[4]*x0 + v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4;

421:       vj++; xp = x + (*vj)*5;

423:       v0 += 5; v1 += 5; v2 += 5; v3 +=5; v4 += 5;
424:     }
425:     /* xk = inv(Dk)*(Dk*xk) */
426:     diag  = aa+k*25;         /* ptr to inv(Dk) */
427:     xp    = x + k*5;
428:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
429:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
430:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
431:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
432:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
433:   }
434:   return(0);
435: }

439: PetscErrorCode MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
440: {
441:   PetscScalar *xp,x0,x1,x2,x3,x4;
442:   PetscInt    nz,*vj,k;

444:   PetscScalar     xp0, xp1, xp2, xp3, xp4;
445:   const MatScalar *v0, *v1, *v2, *v3, *v4;
446:   PetscInt        slen;

449:   slen = 5*(ai[mbs]-ai[0]);
450:   v0   = aa + 25*ai[0]+5*(ai[mbs]-ai[0]);
451:   v1   = v0 + slen;
452:   v2   = v1 + slen;
453:   v3   = v2 + slen;
454:   v4   = v3 + slen;

456:   for (k=mbs-1; k>=0; k--) {

458:     xp = x + k*5;
459:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
460:     nz = ai[k+1] - ai[k];

462:     vj = aj + ai[k+1];
463:     xp = x + (*vj)*5;

465:     while (nz--) {
466:       /* xk += U(k,* */
467:       v0 -= 5; v1 -= 5; v2 -= 5; v3 -=5; v4 -= 5;

469:       vj--; xp = x + (*vj)*5;

471:       xp4 = xp[4]; xp3 = xp[3]; xp2 = xp[2]; xp1 = xp[1]; xp0 = xp[0];
472:       x0 += v0[4]*xp4 + v0[3]*xp3 + v0[2]*xp2 + v0[1]*xp1 + v0[0]*xp0;
473:       x1 += v1[4]*xp4 + v1[3]*xp3 + v1[2]*xp2 + v1[1]*xp1 + v1[0]*xp0;
474:       x2 += v2[4]*xp4 + v2[3]*xp3 + v2[2]*xp2 + v2[1]*xp1 + v2[0]*xp0;
475:       x3 += v3[4]*xp4 + v3[3]*xp3 + v3[2]*xp2 + v3[1]*xp1 + v3[0]*xp0;
476:       x4 += v4[4]*xp4 + v4[3]*xp3 + v4[2]*xp2 + v4[1]*xp1 + v4[0]*xp0;
477:     }
478:     xp = x + k*5;

480:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
481:   }
482:   return(0);
483: }

487: PetscErrorCode MatSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
488: {
489:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
490:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
491:   PetscScalar    *x,*b;

494:   Mat_SeqSBSTRM *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
495:   MatScalar     *as     =sbstrm->as;

498: #if 0
499: #endif
500:   VecGetArray(bb,&b);
501:   VecGetArray(xx,&x);

503:   /* solve U^T * D * y = b by forward substitution */
504:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
505:   MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);

507:   /* solve U*x = y by back substitution */
508:   MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);

510:   VecRestoreArray(bb,&b);
511:   VecRestoreArray(xx,&x);
512:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
513:   return(0);
514: }

518: PetscErrorCode MatForwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
519: {
520:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
521:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
522:   PetscScalar    *x,*b;

525:   Mat_SeqSBSTRM *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
526:   MatScalar     *as     =sbstrm->as;

529:   VecGetArray(bb,&b);
530:   VecGetArray(xx,&x);
531:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
532:   MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);
533:   VecRestoreArray(bb,&b);
534:   VecRestoreArray(xx,&x);
535:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
536:   return(0);
537: }

541: PetscErrorCode MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
542: {
543:   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
544:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
545:   PetscScalar    *x,*b;
547:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
548:   MatScalar      *as     =sbstrm->as;

551:   VecGetArray(bb,&b);
552:   VecGetArray(xx,&x);
553:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
554:   MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);
555:   VecRestoreArray(bb,&b);
556:   VecRestoreArray(xx,&x);
557:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
558:   return(0);
559: }


564: PetscErrorCode SeqSBSTRM_convertFact_sbstrm(Mat F)
565: {
566:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*) F->data;
567:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*) F->spptr;
568:   PetscInt       m       = a->mbs, bs = F->rmap->bs;
569:   PetscInt       *ai     = a->i;
570:   PetscInt       i,j,ib,jb;
571:   MatScalar      *aa = a->a, *aau, *asu;
573:   PetscInt       bs2, rbs,  cbs, blen, slen;
574:   PetscScalar    **asp;

577:   sbstrm->rbs = bs;
578:   sbstrm->cbs = bs;

580:   rbs  = cbs = bs;
581:   bs2  = rbs*cbs;
582:   blen = ai[m]-ai[0];
583:   slen = blen*cbs;

585:   if (sbstrm->as) {
586:     PetscFree(sbstrm->as);
587:   }
588:   PetscMalloc(bs2*ai[m]*sizeof(MatScalar), &sbstrm->as);
589:   PetscMalloc(rbs*sizeof(MatScalar*), &asp);

591:   asu = sbstrm->as;
592:   for (i=0; i<m*bs2; i++) asu[i] = aa[i];

594:   asu = sbstrm->as + ai[0]*bs2;
595:   aau = aa         + ai[0]*bs2;

597:   for (i=0; i<rbs; i++) asp[i] = asu + i*slen;

599:   for (j=0;j<blen;j++) {
600:     for (jb=0; jb<cbs; jb++) {
601:       for (ib=0; ib<rbs; ib++) asp[ib][j*cbs+jb] = aau[j*bs2+jb*rbs+ib];
602:     }
603:   }

605:   switch (bs) {
606:   case 4:
607:     F->ops->forwardsolve  = MatForwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
608:     F->ops->backwardsolve = MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
609:     F->ops->solve         = MatSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
610:     break;
611:   case 5:
612:     F->ops->forwardsolve  = MatForwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
613:     F->ops->backwardsolve = MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
614:     F->ops->solve         = MatSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
615:     break;
616:   default:
617:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
618:   }
619: #if 0
620: #endif

622:   PetscFree(asp);
623:   return(0);
624: }

626: /*=========================================================*/

630: PetscErrorCode MatFactorGetSolverPackage_seqsbaij_sbstrm(Mat A,const MatSolverPackage *type)
631: {
633:   *type = MATSOLVERSBSTRM;
634:   return(0);
635: }

639: PetscErrorCode MatCholeskyFactorNumeric_sbstrm(Mat F,Mat A,const MatFactorInfo *info)
640: {
641:   PetscInt       bs = A->rmap->bs;

645:   switch (bs) {
646:   case 4:
647:     MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(F,A,info);
648:     break;
649:   case 5:
650:     MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(F,A,info);
651:     break;
652:   default:
653:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
654:   }

656:   SeqSBSTRM_convertFact_sbstrm(F);
657:   return(0);
658: }
659: /*=========================================================*/
662: PetscErrorCode MatICCFactorSymbolic_sbstrm(Mat B,Mat A,IS perm,const MatFactorInfo *info)
663: {
664:   PetscInt ierr;

667:   (MatICCFactorSymbolic_SeqSBAIJ)(B,A,perm,info);

669:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_sbstrm;
670:   return(0);
671: }
672: /*=========================================================*/
675: PetscErrorCode MatCholeskyFactorSymbolic_sbstrm(Mat B,Mat A,IS perm,const MatFactorInfo *info)
676: {
677:   PetscInt ierr;

680:   (MatCholeskyFactorSymbolic_SeqSBAIJ)(B,A,perm,info);

682:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_sbstrm;
683:   return(0);
684: }
685: /*=========================================================*/

689: PETSC_EXTERN PetscErrorCode MatGetFactor_seqsbaij_sbstrm(Mat A,MatFactorType ftype,Mat *F)
690: {
691:   Mat            B;
692:   PetscInt       bs = A->rmap->bs;
693:   Mat_SeqSBSTRM  *sbstrm;

697:   if (A->cmap->N != A->rmap->N) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
698:   MatCreate(PetscObjectComm((PetscObject)A),&B);
699:   MatSetSizes(B,A->rmap->n,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
700:   MatSetType(B,((PetscObject)A)->type_name);
701:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,NULL);


704:   B->ops->iccfactorsymbolic      = MatICCFactorSymbolic_sbstrm;
705:   B->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_sbstrm;
706:   B->ops->choleskyfactornumeric  = MatCholeskyFactorNumeric_sbstrm;

708:   B->ops->destroy = MatDestroy_SeqSBSTRM;
709:   B->factortype   = ftype;
710:   B->assembled    = PETSC_TRUE;                  /* required by -ksp_view */
711:   B->preallocated = PETSC_TRUE;

713:   PetscNewLog(B,Mat_SeqSBSTRM,&sbstrm);

715:   PetscObjectComposeFunction((PetscObject)B,"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqsbaij_sbstrm);

717:   B->spptr = sbstrm;
718:   *F       = B;
719:   return(0);
720: }