Actual source code: sbstream.c

petsc-3.4.5 2014-06-29
  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 4:  #include ../src/mat/impls/sbaij/seq/sbstream/sbstream.h

  6: extern PetscErrorCode  MatAssemblyEnd_SeqSBAIJ(Mat,MatAssemblyType);


 11: PetscErrorCode MatDestroy_SeqSBSTRM(Mat A)
 12: {
 14:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*) A->spptr;

 16:   if (sbstrm) {
 17:     PetscFree3(sbstrm->as, sbstrm->asi, sbstrm->asj);
 18:   }
 19:   PetscObjectChangeTypeName((PetscObject)A, MATSEQSBAIJ);
 20:   MatDestroy_SeqSBAIJ(A);
 21:   return(0);
 22: }
 23: /*=========================================================*/
 24: PetscErrorCode MatDuplicate_SeqSBSTRM(Mat A, MatDuplicateOption op, Mat *M)
 25: {
 27:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Cannot duplicate STRM matrices yet");
 28:   return(0);
 29: }
 30: /*=========================================================*/

 34: PetscErrorCode SeqSBSTRM_convert_sbstrm(Mat A)
 35: {
 36:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*) A->data;
 37:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*) A->spptr;
 38:   PetscInt       m       = a->mbs, bs = A->rmap->bs;
 39:   PetscInt       *ai     = a->i;
 40:   PetscInt       i,j,ib,jb;
 41:   MatScalar      *aa = a->a;
 43:   PetscInt       bs2, rbs,  cbs, blen, slen;
 44:   PetscScalar    **asp;

 47:   sbstrm->rbs = bs;
 48:   sbstrm->cbs = bs;


 51:   rbs  = cbs = bs;
 52:   bs2  = rbs*cbs;
 53:   blen = ai[m]-ai[0];
 54:   slen = blen*cbs;

 56:   PetscFree(sbstrm->as);
 57:   PetscMalloc(bs2*blen*sizeof(MatScalar), &sbstrm->as);

 59:   PetscMalloc(rbs*sizeof(MatScalar*), &asp);

 61:   for (i=0; i<rbs; i++) asp[i] = sbstrm->as + i*slen;

 63:   for (j=0;j<blen;j++) {
 64:     for (jb=0; jb<cbs; jb++) {
 65:       for (ib=0; ib<rbs; ib++) asp[ib][j*cbs+jb] = aa[j*bs2+jb*rbs+ib];
 66:     }
 67:   }

 69:   PetscFree(asp);
 70: /*
 71:   switch (bs) {
 72:     case 4:
 73:        A->ops->solve   = MatSolve_SeqSBSTRM_4;
 74:        break;
 75:     case 5:
 76:        A->ops->solve   = MatSolve_SeqSBSTRM_5;
 77:        break;
 78:     default:
 79:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
 80:   }
 81: */
 82:   return(0);
 83: }
 84: /*=========================================================*/
 85: extern PetscErrorCode SeqSBSTRM_create_sbstrm(Mat);
 86: /*=========================================================*/
 89: PetscErrorCode MatAssemblyEnd_SeqSBSTRM(Mat A, MatAssemblyType mode)
 90: {

 94:   MatAssemblyEnd_SeqSBAIJ(A, mode);
 95:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);

 97:   SeqSBSTRM_create_sbstrm(A);
 98:   return(0);
 99: }
100: /*=========================================================*/
103: PETSC_EXTERN PetscErrorCode MatConvert_SeqSBAIJ_SeqSBSTRM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
104: {

107:   Mat           B = *newmat;
108:   Mat_SeqSBSTRM *sbstrm;
109:   /* PetscInt       bs = A->rmap->bs; */

112:   if (reuse == MAT_INITIAL_MATRIX) {
113:     MatDuplicate(A,MAT_COPY_VALUES,&B);
114:   }


117:   PetscNewLog(B,Mat_SeqSBSTRM,&sbstrm);
118:   B->spptr = (void*) sbstrm;

120:   /* Set function pointers for methods that we inherit from BAIJ but override. */
121:   B->ops->duplicate   = MatDuplicate_SeqSBSTRM;
122:   B->ops->assemblyend = MatAssemblyEnd_SeqSBSTRM;
123:   B->ops->destroy     = MatDestroy_SeqSBSTRM;
124:   /*B->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_sbstrm;
125:     B->ops->choleskyfactornumeric  = MatCholeskyFactorNumeric_sbstrm; */

127:   /* If A has already been assembled, compute the permutation. */
128:   if (A->assembled) {
129:     SeqSBSTRM_create_sbstrm(B);
130:   }
131:   PetscObjectChangeTypeName((PetscObject)B,MATSEQSBSTRM);
132:   *newmat = B;
133:   return(0);
134: }

136: /*=========================================================*/
139: PetscErrorCode MatCreateSeqSBSTRM(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
140: {

144:   MatCreate(comm,A);
145:   MatSetSizes(*A,m,n,m,n);
146:   MatSetType(*A,MATSEQSBSTRM);
147:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(*A,bs,nz,(PetscInt*)nnz);
148:   (*A)->rmap->bs = bs;
149:   return(0);
150: }
151: /*=========================================================*/

155: PETSC_EXTERN PetscErrorCode MatCreate_SeqSBSTRM(Mat A)
156: {

160:   MatSetType(A,MATSEQSBAIJ);
161:   MatConvert_SeqSBAIJ_SeqSBSTRM(A,MATSEQSBSTRM,MAT_REUSE_MATRIX,&A);

163:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqisbaij_seqsbstrm_C",MatConvert_SeqSBAIJ_SeqSBSTRM);
164:   return(0);
165: }

167: /*=========================================================*/
168: /*=========================================================*/
171: PetscErrorCode MatMult_SeqSBSTRM_4(Mat A,Vec xx,Vec zz)
172: {
173:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*)A->data;
174:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
175:   PetscScalar    zero    = 0.0;
176:   PetscScalar    sum1,sum2,sum3,sum4,x1,x2,x3,x4, xr1,xr2,xr3,xr4;
177:   PetscScalar    *x, *xb, *z;
178:   MatScalar      *v1, *v2, *v3, *v4;
180:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
181:   PetscInt       nonzerorow=0;
182:   PetscInt       slen;

185:   VecSet(zz,zero);
186:   VecGetArray(xx,&x);
187:   VecGetArray(zz,&z);

189:   slen = 4*(ai[mbs]-ai[0]);
190:   v1   = sbstrm->as;
191:   v2   = v1 + slen;
192:   v3   = v2 + slen;
193:   v4   = v3 + slen;

195:   xb = x;

197:   for (i=0; i<mbs; i++) {
198:     n           = ai[i+1] - ai[i];
199:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
200:     sum1        = z[4*i]; sum2 = z[4*i+1]; sum3 = z[4*i+2]; sum4 = z[4*i+3];
201:     nonzerorow += (n>0);
202:     jmin        = 0;
203:     ib          = aj + ai[i];
204:     if (*ib == i) {     /* (diag of A)*x */
205:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
206:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
207:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4;
208:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
209:       v1   += 4; v2 += 4; v3 += 4; v4 += 4;
210:       jmin++;
211:     }

213:     for (j=jmin; j<n; j++) {
214:       cval       = ib[j]*4;
215:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
216:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
217:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
218:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;

220:       xr1   = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3];
221:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3  + v1[3]*xr4;
222:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3  + v2[3]*xr4;
223:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3  + v3[3]*xr4;
224:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3  + v4[3]*xr4;
225:       v1   += 4; v2 += 4; v3 += 4; v4 += 4;
226:     }
227:     z[4*i]   = sum1;
228:     z[4*i+1] = sum2;
229:     z[4*i+2] = sum3;
230:     z[4*i+3] = sum4;
231:     xb      +=4;

233:   }
234:   VecRestoreArray(xx,&x);
235:   VecRestoreArray(zz,&z);
236:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
237:   return(0);
238: }
239: /*=========================================================*/
242: PetscErrorCode MatMult_SeqSBSTRM_5(Mat A,Vec xx,Vec zz)
243: {
244:   Mat_SeqSBAIJ    *a      = (Mat_SeqSBAIJ*)A->data;
245:   Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
246:   PetscScalar     zero    = 0.0;
247:   PetscScalar     *z      = 0;
248:   PetscScalar     *x,*xb;
249:   const MatScalar *v1, *v2, *v3, *v4, *v5;
250:   PetscScalar     x1, x2, x3, x4, x5;
251:   PetscScalar     xr1, xr2, xr3, xr4, xr5;
252:   PetscScalar     sum1, sum2, sum3, sum4, sum5;
253:   PetscErrorCode  ierr;
254:   PetscInt        mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
255:   PetscInt        nonzerorow=0;
256:   PetscInt        slen;

259:   VecSet(zz,zero);
260:   VecGetArray(xx,&x);
261:   VecGetArray(zz,&z);

263:   slen = 5*(ai[mbs]-ai[0]);

265:   v1 = sbstrm->as;
266:   v2 = v1 + slen;
267:   v3 = v2 + slen;
268:   v4 = v3 + slen;
269:   v5 = v4 + slen;

271:   xb = x;

273:   for (i=0; i<mbs; i++) {
274:     n  = ai[i+1] - ai[i];

276:     nonzerorow += (n>0);

278:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];

280:     sum1 = z[5*i];
281:     sum2 = z[5*i+1];
282:     sum3 = z[5*i+2];
283:     sum4 = z[5*i+3];
284:     sum5 = z[5*i+4];
285:     jmin = 0;
286:     ib   = aj + ai[i];
287:     if (*ib == i) {     /* (diag of A)*x */
288:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
289:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
290:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
291:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v4[4]*x5;
292:       sum5 += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

294:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
295:       jmin++;
296:     }

298:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
299:     PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
300:     PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
301:     PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
302:     PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
303:     PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */

305:     for (j=jmin; j<n; j++) {
306:       cval       = ib[j]*5;
307:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
308:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
309:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
310:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
311:       z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

313:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3]; xr5 = x[cval+4];

315:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3 + v1[3]*xr4 + v1[4]*xr5;
316:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3 + v2[3]*xr4 + v2[4]*xr5;
317:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3 + v3[3]*xr4 + v3[4]*xr5;
318:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3 + v4[3]*xr4 + v4[4]*xr5;
319:       sum5 += v5[0]*xr1 + v5[1]*xr2 + v5[2]*xr3 + v5[3]*xr4 + v5[4]*xr5;

321:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
322:     }
323:     z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; z[5*i+3] = sum4; z[5*i+4] = sum5;
324:     xb    += 5;
325:   }
326:   VecRestoreArray(xx,&x);
327:   VecRestoreArray(zz,&z);
328:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
329:   return(0);
330: }
331: /*=========================================================*/
334: PetscErrorCode MatMultAdd_SeqSBSTRM_4(Mat A,Vec xx,Vec yy,Vec zz)
335: {

337:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*)A->data;
338:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
339:   PetscScalar    sum1,sum2,sum3,sum4,x1,x2,x3,x4, xr1,xr2,xr3,xr4;
340:   PetscScalar    *x,*z, *xb;
341:   MatScalar      *v1, *v2, *v3, *v4;
343:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
344:   PetscInt       nonzerorow=0;
345:   PetscInt       slen;

348:   VecCopy(yy,zz);
349:   VecGetArray(xx,&x);
350:   VecGetArray(zz,&z);

352:   slen = 4*(ai[mbs]-ai[0]);
353:   v1   = sbstrm->as;
354:   v2   = v1 + slen;
355:   v3   = v2 + slen;
356:   v4   = v3 + slen;

358:   xb = x;

360:   for (i=0; i<mbs; i++) {
361:     n  = ai[i+1] - ai[i];
362:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];  xb += 4;

364:     sum1 = z[4*i];
365:     sum2 = z[4*i + 1];
366:     sum3 = z[4*i + 2];
367:     sum4 = z[4*i + 3];

369:     nonzerorow += (n>0);

371:     jmin = 0;
372:     ib   = aj + ai[i];
373:     if (*ib == i) {     /* (diag of A)*x */
374:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
375:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
376:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4;
377:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;

379:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
380:       jmin++;
381:     }

383:     for (j=jmin; j<n; j++) {
384:       cval       = ib[j]*4;
385:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
386:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
387:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
388:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;

390:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3];

392:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3  + v1[3]*xr4;
393:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3  + v2[3]*xr4;
394:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3  + v3[3]*xr4;
395:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3  + v4[3]*xr4;

397:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
398:     }
399:     z[4*i]   = sum1;
400:     z[4*i+1] = sum2;
401:     z[4*i+2] = sum3;
402:     z[4*i+3] = sum4;

404:   }
405:   VecRestoreArray(xx,&x);
406:   VecRestoreArray(zz,&z);
407:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
408:   return(0);
409: }

411: /*=========================================================*/
414: PetscErrorCode MatMultAdd_SeqSBSTRM_5(Mat A,Vec xx,Vec yy,Vec zz)
415: {
416:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*)A->data;
417:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*)A->spptr;
418:   PetscScalar    *x,*xb, *z;
419:   MatScalar      *v1, *v2, *v3, *v4, *v5;
420:   PetscScalar    x1, x2, x3, x4, x5;
421:   PetscScalar    xr1, xr2, xr3, xr4, xr5;
422:   PetscScalar    sum1, sum2, sum3, sum4, sum5;
424:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
425:   PetscInt       nonzerorow=0;
426:   PetscInt       slen;

429:   VecCopy(yy,zz);
430:   VecGetArray(xx,&x);
431:   VecGetArray(zz,&z);


434:   slen = 5*(ai[mbs]-ai[0]);
435:   v1   = sbstrm->as;
436:   v2   = v1 + slen;
437:   v3   = v2 + slen;
438:   v4   = v3 + slen;
439:   v5   = v4 + slen;

441:   xb = x;

443:   for (i=0; i<mbs; i++) {
444:     n  = ai[i+1] - ai[i];
445:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; xb += 5;

447:     sum1 = z[5*i];
448:     sum2 = z[5*i+1];
449:     sum3 = z[5*i+2];
450:     sum4 = z[5*i+3];
451:     sum5 = z[5*i+4];

453:     nonzerorow += (n>0);

455:     jmin = 0;
456:     ib   = aj + ai[i];
457:     if (*ib == i) {     /* (diag of A)*x, only upper triangular part is used */
458:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
459:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
460:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
461:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v4[4]*x5;
462:       sum5 += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

464:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
465:       jmin++;
466:     }

468:     for (j=jmin; j<n; j++) {
469:       cval       = ib[j]*5;
470:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
471:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
472:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
473:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
474:       z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

476:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3]; xr5 = x[cval+4];

478:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3 + v1[3]*xr4 + v1[4]*xr5;
479:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3 + v2[3]*xr4 + v2[4]*xr5;
480:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3 + v3[3]*xr4 + v3[4]*xr5;
481:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3 + v4[3]*xr4 + v4[4]*xr5;
482:       sum5 += v5[0]*xr1 + v5[1]*xr2 + v5[2]*xr3 + v5[3]*xr4 + v5[4]*xr5;

484:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
485:     }
486:     z[5*i]   = sum1;
487:     z[5*i+1] = sum2;
488:     z[5*i+2] = sum3;
489:     z[5*i+3] = sum4;
490:     z[5*i+4] = sum5;
491:   }
492:   VecRestoreArray(xx,&x);
493:   VecRestoreArray(zz,&z);
494:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
495:   return(0);
496: }

498: /*=========================================================*/
501: PetscErrorCode SeqSBSTRM_create_sbstrm(Mat A)
502: {
503:   Mat_SeqSBAIJ   *a      = (Mat_SeqSBAIJ*) A->data;
504:   Mat_SeqSBSTRM  *sbstrm = (Mat_SeqSBSTRM*) A->spptr;
505:   PetscInt       MROW    = a->mbs, bs = A->rmap->bs;
506:   PetscInt       *ai     = a->i;
507:   PetscInt       i,j,k;
508:   MatScalar      *aa = a->a;
510:   PetscInt       bs2, rbs,  cbs, blen, slen;
511:   PetscScalar    **asp;

514:   sbstrm->rbs = sbstrm->cbs = bs;

516:   rbs  = cbs = bs;
517:   bs2  = rbs*cbs;
518:   blen = ai[MROW]-ai[0];
519:   slen = blen*cbs;

521:   PetscMalloc(bs2*blen*sizeof(PetscScalar), &sbstrm->as);

523:   PetscMalloc(rbs*sizeof(PetscScalar*), &asp);

525:   for (i=0; i<rbs; i++) asp[i] = sbstrm->as + i*slen;

527:   for (k=0; k<blen; k++) {
528:     for (j=0; j<cbs; j++) {
529:       for (i=0; i<rbs; i++) asp[i][k*cbs+j] = aa[k*bs2+j*rbs+i];
530:     }
531:   }

533:   /* PetscFree(a->a);  */

535:   switch (bs) {
536:   case 4:
537:     A->ops->mult    = MatMult_SeqSBSTRM_4;
538:     A->ops->multadd = MatMultAdd_SeqSBSTRM_4;
539:     /** A->ops->sor     = MatSOR_SeqSBSTRM_4;  **/
540:     break;
541:   case 5:
542:     A->ops->mult    = MatMult_SeqSBSTRM_5;
543:     A->ops->multadd = MatMultAdd_SeqSBSTRM_5;
544:     /** A->ops->sor     = MatSOR_SeqSBSTRM_5;  **/
545:     break;
546:   default:
547:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
548:   }
549:   return(0);
550: }
551: /*=========================================================*/

555: PetscErrorCode MatSeqSBSTRMTransform(Mat A)
556: {
558:   SeqSBSTRM_convert_sbstrm(A);
559:   return(0);
560: }