Actual source code: sbstrmfact.c

petsc-3.3-p7 2013-05-11
  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 4:  #include ../src/mat/impls/sbaij/seq/sbstream/sbstream.h

  6: extern PetscErrorCode MatDestroy_SeqSBSTRM(Mat A);


 11: PetscErrorCode MatSolve_SeqSBSTRM_4_inplace(Mat A,Vec bb,Vec xx)
 12: {
 13:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
 14:   IS             isrow=a->row;
 15:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
 17:   const PetscInt *r;
 18:   PetscInt       nz,*vj,k,idx;
 19:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

 21:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
 22:   MatScalar          *as=sbstrm->as,*diag;
 23:   PetscScalar        tp0, tp1, tp2, tp3;
 24:   const MatScalar    *v0, *v1, *v2, *v3;
 25:   PetscInt           slen;


 29:   slen = 4*(ai[mbs]-ai[0]);
 30:   v0  = as + 16*ai[0];
 31:   v1  = v0 + slen;
 32:   v2  = v1 + slen;
 33:   v3  = v2 + slen;

 35:   VecGetArray(bb,&b);
 36:   VecGetArray(xx,&x);
 37:   t  = a->solve_work;
 38:   ISGetIndices(isrow,&r);

 40:   /* solve U^T * D * y = b by forward substitution */
 41:   tp = t;
 42:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 43:     idx   = 4*r[k];
 44:     tp[0] = b[idx];
 45:     tp[1] = b[idx+1];
 46:     tp[2] = b[idx+2];
 47:     tp[3] = b[idx+3];
 48:     tp += 4;
 49:   }
 50: 
 51:   for (k=0; k<mbs; k++){
 52:     vj = aj + ai[k];
 53:     tp = t + k*4;
 54:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
 55:     nz = ai[k+1] - ai[k];

 57:     tp = t + (*vj)*4;
 58:     while (nz--) {
 59:       tp[0] += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3;
 60:       tp[1] += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3;
 61:       tp[2] += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3;
 62:       tp[3] += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3;
 63:       vj++; tp = t + (*vj)*4;
 64:       v0 += 4; v1 += 4; v2 += 4; v3 += 4;
 65:     }

 67:     /* xk = inv(Dk)*(Dk*xk) */
 68:     diag  = as+k*16;          /* ptr to inv(Dk) */
 69:     tp    = t + k*4;
 70:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
 71:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
 72:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
 73:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
 74:   }

 76:   /* solve U*x = y by back substitution */
 77:   for (k=mbs-1; k>=0; k--){
 78:     vj = aj + ai[k+1];
 79:     tp    = t + k*4;
 80:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
 81:     nz = ai[k+1] - ai[k];
 82: 
 83:     tp = t + (*vj)*4;
 84:     while (nz--) {
 85:       /* xk += U(k,* */
 86:       v0 -= 4; v1 -= 4; v2 -= 4; v3 -= 4;
 87:       vj--; tp = t + (*vj)*4;
 88:       tp0 = tp[0]; tp1 = tp[1]; tp2 = tp[2]; tp3 = tp[3];
 89:       x0 += v0[3]*tp3 + v0[2]*tp2 + v0[1]*tp1 + v0[0]*tp0;
 90:       x1 += v1[3]*tp3 + v1[2]*tp2 + v1[1]*tp1 + v1[0]*tp0;
 91:       x2 += v2[3]*tp3 + v2[2]*tp2 + v2[1]*tp1 + v2[0]*tp0;
 92:       x3 += v3[3]*tp3 + v3[2]*tp2 + v3[1]*tp1 + v3[0]*tp0;
 93:     }
 94:     tp    = t + k*4;
 95:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
 96:     idx        = 4*r[k];
 97:     x[idx]     = x0;
 98:     x[idx+1]   = x1;
 99:     x[idx+2]   = x2;
100:     x[idx+3]   = x3;
101:   }

103:   ISRestoreIndices(isrow,&r);
104:   VecRestoreArray(bb,&b);
105:   VecRestoreArray(xx,&x);
106:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
107:    return(0);
108: }

112: PetscErrorCode MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
113: {
114:   MatScalar      *diag;
115:   PetscScalar    *xp,x0,x1,x2,x3;
116:   PetscInt       nz,*vj,k;

118:   const MatScalar    *v0, *v1, *v2, *v3;
119:   PetscInt           slen;


123:   slen = 4*(ai[mbs]-ai[0]);
124:   v0  = aa + 16*ai[0];
125:   v1  = v0 + slen;
126:   v2  = v1 + slen;
127:   v3  = v2 + slen;

129:   for (k=0; k<mbs; k++){
130:     xp = x + k*4;
131:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
132:     nz = ai[k+1] - ai[k];
133:     vj = aj + ai[k];
134:     xp = x + (*vj)*4;
135:     while (nz--) {
136:       /*  += U(k,^T*(Dk*xk) */
137:       xp[0] += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3;
138:       xp[1] += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3;
139:       xp[2] += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3;
140:       xp[3] += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3;
141:       vj++; xp = x + (*vj)*4;
142:       v0 += 4; v1 += 4; v2 += 4; v3 += 4;
143:     }
144:     /* xk = inv(Dk)*(Dk*xk) */
145:     diag = aa+k*16;          /* ptr to inv(Dk) */
146:     xp   = x + k*4;
147:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
148:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
149:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
150:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
151:   }
152:   return(0);
153: }

157: PetscErrorCode MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
158: {
159:   PetscScalar    *xp,x0,x1,x2,x3;
160:   PetscInt       nz,*vj,k;

162:   PetscScalar        xp0, xp1, xp2, xp3;
163:   const MatScalar    *v0, *v1, *v2, *v3;
164:   PetscInt           slen;

167:   slen = 4*(ai[mbs]-ai[0]);
168:   v0  = aa + 16*ai[0]+4*(ai[mbs]-ai[0]);
169:   v1  = v0 + slen;
170:   v2  = v1 + slen;
171:   v3  = v2 + slen;

173:   for (k=mbs-1; k>=0; k--){
174:     xp = x + k*4;
175:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
176:     nz = ai[k+1] - ai[k];
177:     vj = aj + ai[k+1];
178:     xp = x + (*vj)*4;
179:     while (nz--) {
180:       /* xk += U(k,* */
181:       v0 -= 4; v1 -= 4; v2 -= 4; v3 -=4;
182:       vj--; xp = x + (*vj)*4;
183:       xp0 = xp[0]; xp1 = xp[1]; xp2 = xp[2]; xp3 = xp[3];
184:       x0 += v0[3]*xp3 + v0[2]*xp2 + v0[1]*xp1 + v0[0]*xp0;
185:       x1 += v1[3]*xp3 + v1[2]*xp2 + v1[1]*xp1 + v1[0]*xp0;
186:       x2 += v2[3]*xp3 + v2[2]*xp2 + v2[1]*xp1 + v2[0]*xp0;
187:       x3 += v3[3]*xp3 + v3[2]*xp2 + v3[1]*xp1 + v3[0]*xp0;
188:     }
189:     xp = x + k*4;
190:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
191:   }
192:   return(0);
193: }

197: PetscErrorCode MatSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
198: {
199:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
200:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
201:   PetscScalar    *x,*b;

204:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
205:   MatScalar        *as=sbstrm->as;

208: #if 0
209:   MatSolve_SeqSBSTRM_4_inplace(A, bb, xx);
210: #endif
211:   VecGetArray(bb,&b);
212:   VecGetArray(xx,&x);
213:   /* solve U^T * D * y = b by forward substitution */
214:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
215:   MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
216:   /* solve U*x = y by back substitution */
217:   MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
218:   VecRestoreArray(bb,&b);
219:   VecRestoreArray(xx,&x);
220:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
221:   return(0);
222: }

226: PetscErrorCode MatForwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
227: {
228:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
229:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
230:   PetscScalar    *x,*b;

233:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
234:   MatScalar        *as=sbstrm->as;

237:   VecGetArray(bb,&b);
238:   VecGetArray(xx,&x);
239:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
240:   MatForwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
241:   VecRestoreArray(bb,&b);
242:   VecRestoreArray(xx,&x);
243:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
244:   return(0);
245: }

249: PetscErrorCode MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
250: {
251:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
252:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
253:   PetscScalar    *x,*b;

256:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
257:   MatScalar        *as=sbstrm->as;

260:   VecGetArray(bb,&b);
261:   VecGetArray(xx,&x);
262:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
263:   MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering(ai,aj,as,mbs,x);
264:   VecRestoreArray(bb,&b);
265:   VecRestoreArray(xx,&x);
266:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
267:   return(0);
268: }

272: PetscErrorCode MatSolve_SeqSBSTRM_5_inplace(Mat A,Vec bb,Vec xx)
273: {
274:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
275:   IS             isrow=a->row;
276:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
278:   const PetscInt *r;
279:   PetscInt       nz,*vj,k,idx;
280:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

282:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
283:   MatScalar          *as=sbstrm->as,*diag;
284:   PetscScalar        tp0, tp1, tp2, tp3, tp4;
285:   const MatScalar    *v0, *v1, *v2, *v3, *v4;
286:   PetscInt           slen;

289:   VecGetArray(bb,&b);
290:   VecGetArray(xx,&x);
291:   t  = a->solve_work;
292:   ISGetIndices(isrow,&r);


295:   slen = 5*(ai[mbs]-ai[0]);
296:   v0  = as + 25*ai[0];
297:   v1  = v0 + slen;
298:   v2  = v1 + slen;
299:   v3  = v2 + slen;
300:   v4  = v3 + slen;

302:   /* solve U^T * D * y = b by forward substitution */
303:   tp = t;
304:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
305:     idx   = 5*r[k];
306:     tp[0] = b[idx];
307:     tp[1] = b[idx+1];
308:     tp[2] = b[idx+2];
309:     tp[3] = b[idx+3];
310:     tp[4] = b[idx+4];
311:     tp += 5;
312:   }
313: 
314:   for (k=0; k<mbs; k++){
315:     vj = aj + ai[k];
316:     tp = t + k*5;
317:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
318:     nz = ai[k+1] - ai[k];

320:     tp = t + (*vj)*5;
321:     while (nz--) {
322:       tp[0] += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
323:       tp[1] += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
324:       tp[2] += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
325:       tp[3] += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
326:       tp[4] += v0[4]*x0 + v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4;
327:       vj++; tp = t + (*vj)*5;
328:       v0 += 5; v1 += 5; v2 += 5; v3 +=5; v4 += 5;
329:     }

331:     /* xk = inv(Dk)*(Dk*xk) */
332:     diag  = as+k*25;          /* ptr to inv(Dk) */
333:     tp    = t + k*5;
334:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
335:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
336:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
337:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
338:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
339:   }

341:   /* solve U*x = y by back substitution */
342:   for (k=mbs-1; k>=0; k--){
343:     vj = aj + ai[k+1];
344:     tp    = t + k*5;
345:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
346:     nz = ai[k+1] - ai[k];
347: 
348:     tp = t + (*vj)*5;
349:     while (nz--) {
350:       /* xk += U(k,* */
351:       v0 -= 5; v1 -= 5; v2 -= 5; v3 -=5; v4 -= 5;
352:       vj--; tp = t + (*vj)*5;
353:       tp0 = tp[0]; tp1 = tp[1]; tp2 = tp[2]; tp3 = tp[3]; tp4 = tp[4];
354:       x0 += v0[4]*tp4 + v0[3]*tp3 + v0[2]*tp2 + v0[1]*tp1 + v0[0]*tp0;
355:       x1 += v1[4]*tp4 + v1[3]*tp3 + v1[2]*tp2 + v1[1]*tp1 + v1[0]*tp0;
356:       x2 += v2[4]*tp4 + v2[3]*tp3 + v2[2]*tp2 + v2[1]*tp1 + v2[0]*tp0;
357:       x3 += v3[4]*tp4 + v3[3]*tp3 + v3[2]*tp2 + v3[1]*tp1 + v3[0]*tp0;
358:       x4 += v4[4]*tp4 + v4[3]*tp3 + v4[2]*tp2 + v4[1]*tp1 + v4[0]*tp0;
359:     }
360:     tp    = t + k*5;
361:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
362:     idx   = 5*r[k];
363:     x[idx]     = x0;
364:     x[idx+1]   = x1;
365:     x[idx+2]   = x2;
366:     x[idx+3]   = x3;
367:     x[idx+4]   = x4;
368:   }

370:   ISRestoreIndices(isrow,&r);
371:   VecRestoreArray(bb,&b);
372:   VecRestoreArray(xx,&x);
373:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
374:   return(0);
375: }

379: PetscErrorCode MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
380: {
381:   MatScalar      *diag;
382:   PetscScalar    *xp,x0,x1,x2,x3,x4;
383:   PetscInt       nz,*vj,k;

385:   const MatScalar   *v0, *v1, *v2, *v3, *v4;
386:   PetscInt           slen;

389: 
390: 

392:   slen = 5*(ai[mbs]-ai[0]);
393:   v0  = aa + 25*ai[0];
394:   v1  = v0 + slen;
395:   v2  = v1 + slen;
396:   v3  = v2 + slen;
397:   v4  = v3 + slen;

399:   for (k=0; k<mbs; k++){
400:     xp = x + k*5;
401:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
402:     nz = ai[k+1] - ai[k];
403:     vj = aj + ai[k];
404:     xp = x + (*vj)*5;
405:     while (nz--) {
406:       /*  += U(k,^T*(Dk*xk) */
407:       xp[0] += v0[0]*x0 + v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
408:       xp[1] += v0[1]*x0 + v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
409:       xp[2] += v0[2]*x0 + v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
410:       xp[3] += v0[3]*x0 + v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
411:       xp[4] += v0[4]*x0 + v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4;
412:       vj++; xp = x + (*vj)*5;
413:       v0 += 5; v1 += 5; v2 += 5; v3 +=5; v4 += 5;
414:     }
415:     /* xk = inv(Dk)*(Dk*xk) */
416:     diag = aa+k*25;          /* ptr to inv(Dk) */
417:     xp   = x + k*5;
418:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
419:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
420:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
421:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
422:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
423:   }
424:   return(0);
425: }

429: PetscErrorCode MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
430: {
431:   PetscScalar    *xp,x0,x1,x2,x3,x4;
432:   PetscInt       nz,*vj,k;

434:   PetscScalar        xp0, xp1, xp2, xp3, xp4;
435:   const MatScalar    *v0, *v1, *v2, *v3, *v4;
436:   PetscInt           slen;



441:   slen = 5*(ai[mbs]-ai[0]);
442:   v0  = aa + 25*ai[0]+5*(ai[mbs]-ai[0]);
443:   v1  = v0 + slen;
444:   v2  = v1 + slen;
445:   v3  = v2 + slen;
446:   v4  = v3 + slen;

448:   for (k=mbs-1; k>=0; k--){

450:     xp = x + k*5;
451:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
452:     nz = ai[k+1] - ai[k];

454:     vj = aj + ai[k+1];
455:     xp = x + (*vj)*5;

457:     while (nz--) {
458:       /* xk += U(k,* */
459:       v0 -= 5; v1 -= 5; v2 -= 5; v3 -=5; v4 -= 5;
460:       vj--; xp = x + (*vj)*5;
461:       xp4 = xp[4]; xp3 = xp[3]; xp2 = xp[2]; xp1 = xp[1]; xp0 = xp[0];
462:       x0 += v0[4]*xp4 + v0[3]*xp3 + v0[2]*xp2 + v0[1]*xp1 + v0[0]*xp0;
463:       x1 += v1[4]*xp4 + v1[3]*xp3 + v1[2]*xp2 + v1[1]*xp1 + v1[0]*xp0;
464:       x2 += v2[4]*xp4 + v2[3]*xp3 + v2[2]*xp2 + v2[1]*xp1 + v2[0]*xp0;
465:       x3 += v3[4]*xp4 + v3[3]*xp3 + v3[2]*xp2 + v3[1]*xp1 + v3[0]*xp0;
466:       x4 += v4[4]*xp4 + v4[3]*xp3 + v4[2]*xp2 + v4[1]*xp1 + v4[0]*xp0;
467:     }
468:     xp = x + k*5;
469:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
470:   }
471:   return(0);
472: }

476: PetscErrorCode MatSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
477: {
478:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
479:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
480:   PetscScalar    *x,*b;

483:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
484:   MatScalar        *as=sbstrm->as;

487: #if 0 
488: #endif 
489:   VecGetArray(bb,&b);
490:   VecGetArray(xx,&x);

492:   /* solve U^T * D * y = b by forward substitution */
493:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
494:   MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);

496:   /* solve U*x = y by back substitution */
497:   MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);

499:   VecRestoreArray(bb,&b);
500:   VecRestoreArray(xx,&x);
501:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
502:   return(0);
503: }

507: PetscErrorCode MatForwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
508: {
509:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
510:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
511:   PetscScalar    *x,*b;

514:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
515:   MatScalar        *as=sbstrm->as;

518:   VecGetArray(bb,&b);
519:   VecGetArray(xx,&x);
520:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
521:   MatForwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);
522:   VecRestoreArray(bb,&b);
523:   VecRestoreArray(xx,&x);
524:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
525:   return(0);
526: }

530: PetscErrorCode MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
531: {
532:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
533:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
534:   PetscScalar    *x,*b;
536:   Mat_SeqSBSTRM    *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
537:   MatScalar        *as=sbstrm->as;

540:   VecGetArray(bb,&b);
541:   VecGetArray(xx,&x);
542:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
543:   MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering(ai,aj,as,mbs,x);
544:   VecRestoreArray(bb,&b);
545:   VecRestoreArray(xx,&x);
546:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
547:   return(0);
548: }


553: PetscErrorCode SeqSBSTRM_convertFact_sbstrm(Mat F)
554: {
555:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *) F->data;
556:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*) F->spptr;
557:   PetscInt       m = a->mbs, bs = F->rmap->bs;
558:   PetscInt       *ai = a->i;
559:   PetscInt       i,j,ib,jb;
560:   MatScalar      *aa = a->a, *aau, *asu;
562:   PetscInt  bs2, rbs,  cbs, blen, slen;
563:   PetscScalar **asp ;

566:   sbstrm->rbs = bs;
567:   sbstrm->cbs = bs;

569:   rbs = cbs = bs;
570:   bs2 = rbs*cbs;
571:   blen = ai[m]-ai[0];
572:   slen = blen*cbs;

574:   if(sbstrm->as) {
575:       PetscFree(sbstrm->as);
576:   }
577:   PetscMalloc(bs2*ai[m]*sizeof(MatScalar), &sbstrm->as);
578:   PetscMalloc(rbs*sizeof(MatScalar *), &asp);

580:   asu = sbstrm->as;
581:   for (i=0; i<m*bs2; i++) asu[i] = aa[i];

583:   asu = sbstrm->as + ai[0]*bs2;
584:   aau = aa         + ai[0]*bs2;

586:   for(i=0;i<rbs;i++) asp[i] = asu + i*slen;

588:   for(j=0;j<blen;j++) {
589:      for (jb=0; jb<cbs; jb++){
590:      for (ib=0; ib<rbs; ib++){
591:          asp[ib][j*cbs+jb] = aau[j*bs2+jb*rbs+ib];
592:      }}
593:   }

595:   switch (bs){
596:     case 4:
597:        F->ops->forwardsolve   = MatForwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
598:        F->ops->backwardsolve  = MatBackwardSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
599:        F->ops->solve          = MatSolve_SeqSBSTRM_4_NaturalOrdering_inplace;
600:        break;
601:     case 5:
602:        F->ops->forwardsolve   = MatForwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
603:        F->ops->backwardsolve  = MatBackwardSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
604:        F->ops->solve          = MatSolve_SeqSBSTRM_5_NaturalOrdering_inplace;
605:        break;
606:     default:
607:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
608:   }
609: #if 0
610: #endif

612:   PetscFree(asp);
613:   return(0);
614: }

616: /*=========================================================*/

618: EXTERN_C_BEGIN
621: PetscErrorCode MatFactorGetSolverPackage_seqsbaij_sbstrm(Mat A,const MatSolverPackage *type)
622: {
624:   *type = MATSOLVERSBSTRM;
625:   return(0);
626: }
627: EXTERN_C_END

631: PetscErrorCode MatCholeskyFactorNumeric_sbstrm(Mat F,Mat A,const MatFactorInfo *info)
632: {
633:   PetscInt          bs = A->rmap->bs;

637:   switch (bs){
638:     case 4:
639:        MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(F,A,info);
640:        break;
641:     case 5:
642:        MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(F,A,info);
643:        break;
644:     default:
645:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
646:   }
647: 
648:   SeqSBSTRM_convertFact_sbstrm(F);

650:   return(0);
651: }
652: /*=========================================================*/
655: PetscErrorCode MatICCFactorSymbolic_sbstrm(Mat B,Mat A,IS perm,const MatFactorInfo *info)
656: {
657:   PetscInt ierr;
659:   (MatICCFactorSymbolic_SeqSBAIJ)(B,A,perm,info);
660:   B->ops->choleskyfactornumeric  = MatCholeskyFactorNumeric_sbstrm;
661:   return(0);
662: }
663: /*=========================================================*/
666: PetscErrorCode MatCholeskyFactorSymbolic_sbstrm(Mat B,Mat A,IS perm,const MatFactorInfo *info)
667: {
668:   PetscInt ierr;
670:   (MatCholeskyFactorSymbolic_SeqSBAIJ)(B,A,perm,info);
671:   B->ops->choleskyfactornumeric  = MatCholeskyFactorNumeric_sbstrm;
672:   return(0);
673: }
674: /*=========================================================*/

676: EXTERN_C_BEGIN
679: PetscErrorCode MatGetFactor_seqsbaij_sbstrm(Mat A,MatFactorType ftype,Mat *F)
680: {
681:   Mat            B;
682:   PetscInt       bs = A->rmap->bs;
683:   Mat_SeqSBSTRM   *sbstrm;

687:   if (A->cmap->N != A->rmap->N) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
688:   MatCreate(((PetscObject)A)->comm,&B);
689:   MatSetSizes(B,A->rmap->n,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
690:   MatSetType(B,((PetscObject)A)->type_name);
691:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);


694:   B->ops->iccfactorsymbolic       = MatICCFactorSymbolic_sbstrm;
695:   B->ops->choleskyfactorsymbolic  = MatCholeskyFactorSymbolic_sbstrm;
696:   B->ops->choleskyfactornumeric   = MatCholeskyFactorNumeric_sbstrm;

698:   B->ops->destroy                 = MatDestroy_SeqSBSTRM;
699:   B->factortype                   = ftype;
700:   B->assembled                    = PETSC_TRUE;  /* required by -ksp_view */
701:   B->preallocated                 = PETSC_TRUE;

703:   PetscNewLog(B,Mat_SeqSBSTRM,&sbstrm);

705:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqsbaij_sbstrm",MatFactorGetSolverPackage_seqsbaij_sbstrm);

707:   B->spptr = sbstrm;
708:   *F = B;
709:   return(0);
710: }
711: EXTERN_C_END