Actual source code: sbstream.c

petsc-3.3-p7 2013-05-11
  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 4:  #include ../src/mat/impls/sbaij/seq/sbstream/sbstream.h

  6: #if 0 
  7: extern   PetscErrorCode MatFactorGetSolverPackage_seqsbaij_sbstrm(Mat, const MatSolverPackage *);
  8: extern   PetscErrorCode MatICCFactorSymbolic_sbstrm(Mat,Mat, IS, const MatFactorInfo *);
  9: extern   PetscErrorCode MatCholeskyFactorSymbolic_sbstrm(Mat, Mat, IS, const MatFactorInfo *);
 10: extern   PetscErrorCode MatCholeskyFactorNumeric_sbstrm (Mat, Mat, const MatFactorInfo *);
 11: #endif 

 13: extern   PetscErrorCode  MatAssemblyEnd_SeqSBAIJ(Mat,MatAssemblyType);


 18: PetscErrorCode MatDestroy_SeqSBSTRM(Mat A)
 19: {
 21:   Mat_SeqSBSTRM       *sbstrm = (Mat_SeqSBSTRM *) A->spptr;

 23:   if (sbstrm) {
 24:     PetscFree3(sbstrm->as, sbstrm->asi, sbstrm->asj);
 25:   }
 26:   PetscObjectChangeTypeName( (PetscObject)A, MATSEQSBAIJ);
 27:   MatDestroy_SeqSBAIJ(A);
 28:   return(0);
 29: }
 30: /*=========================================================*/
 31: PetscErrorCode MatDuplicate_SeqSBSTRM(Mat A, MatDuplicateOption op, Mat *M)
 32: {
 34:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Cannot duplicate STRM matrices yet");
 35:   return(0);
 36: }
 37: /*=========================================================*/

 41: PetscErrorCode SeqSBSTRM_convert_sbstrm(Mat A)
 42: {
 43:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *) A->data;
 44:   Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*) A->spptr;
 45:   PetscInt       m = a->mbs, bs = A->rmap->bs;
 46:   PetscInt       *ai = a->i;
 47:   PetscInt       i,j,ib,jb;
 48:   MatScalar      *aa = a->a;
 50:   PetscInt  bs2, rbs,  cbs, blen, slen;
 51:   PetscScalar **asp;

 54:   sbstrm->rbs = bs;
 55:   sbstrm->cbs = bs;


 58:   rbs = cbs = bs;
 59:   bs2 = rbs*cbs;
 60:   blen = ai[m]-ai[0];
 61:   slen = blen*cbs;

 63:   PetscFree(sbstrm->as);
 64:   PetscMalloc(bs2*blen*sizeof(MatScalar), &sbstrm->as);

 66:   PetscMalloc(rbs*sizeof(MatScalar *), &asp);
 67: 
 68:   for(i=0;i<rbs;i++) asp[i] = sbstrm->as + i*slen;

 70:   for(j=0;j<blen;j++) {
 71:      for (jb=0; jb<cbs; jb++){
 72:      for (ib=0; ib<rbs; ib++){
 73:          asp[ib][j*cbs+jb] = aa[j*bs2+jb*rbs+ib];
 74:      }}
 75:   }

 77:   PetscFree(asp);
 78: /*
 79:   switch (bs){
 80:     case 4:
 81:        A->ops->solve   = MatSolve_SeqSBSTRM_4;
 82:        break; 
 83:     case 5:
 84:        A->ops->solve   = MatSolve_SeqSBSTRM_5;
 85:        break;  
 86:     default:
 87:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
 88:   }
 89: */
 90:   return(0);
 91: }
 92: /*=========================================================*/
 93: extern PetscErrorCode SeqSBSTRM_create_sbstrm(Mat);
 94: /*=========================================================*/
 97: PetscErrorCode MatAssemblyEnd_SeqSBSTRM(Mat A, MatAssemblyType mode)
 98: {

102:   MatAssemblyEnd_SeqSBAIJ(A, mode);
103:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);

105:   SeqSBSTRM_create_sbstrm(A);
106:   return(0);
107: }
108: /*=========================================================*/
109: EXTERN_C_BEGIN
112: PetscErrorCode MatConvert_SeqSBAIJ_SeqSBSTRM(Mat A,const MatType type,MatReuse reuse,Mat *newmat)
113: {

116:   Mat            B = *newmat;
117:   Mat_SeqSBSTRM   *sbstrm;
118:   /* PetscInt       bs = A->rmap->bs; */

121:   if (reuse == MAT_INITIAL_MATRIX) {
122:     MatDuplicate(A,MAT_COPY_VALUES,&B);
123:   }
124: 

126:   PetscNewLog(B,Mat_SeqSBSTRM,&sbstrm);
127:   B->spptr = (void *) sbstrm;

129:   /* Set function pointers for methods that we inherit from BAIJ but override. */
130:   B->ops->duplicate        = MatDuplicate_SeqSBSTRM;
131:   B->ops->assemblyend      = MatAssemblyEnd_SeqSBSTRM;
132:   B->ops->destroy          = MatDestroy_SeqSBSTRM;
133:   /*B->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_sbstrm;
134:     B->ops->choleskyfactornumeric  = MatCholeskyFactorNumeric_sbstrm; */

136:   /* If A has already been assembled, compute the permutation. */
137:   if (A->assembled) {
138:       SeqSBSTRM_create_sbstrm(B);
139:   }
140:   PetscObjectChangeTypeName((PetscObject)B,MATSEQSBSTRM);
141:   *newmat = B;
142:   return(0);
143: }
144: EXTERN_C_END

146: /*=========================================================*/
149: PetscErrorCode MatCreateSeqSBSTRM(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
150: {
153:     MatCreate(comm,A);
154:     MatSetSizes(*A,m,n,m,n);
155:     MatSetType(*A,MATSEQSBSTRM);
156:     MatSeqSBAIJSetPreallocation_SeqSBAIJ(*A,bs,nz,(PetscInt*)nnz);
157:     (*A)->rmap->bs = bs;
158:   return(0);
159: }
160: /*=========================================================*/
161: EXTERN_C_BEGIN
164: PetscErrorCode  MatCreate_SeqSBSTRM(Mat A)
165: {

169:   MatSetType(A,MATSEQSBAIJ);
170:   MatConvert_SeqSBAIJ_SeqSBSTRM(A,MATSEQSBSTRM,MAT_REUSE_MATRIX,&A);

172:   PetscObjectComposeFunctionDynamic((PetscObject)A,"MatConvert_seqisbaij_seqsbstrm_C",
173:                                      "MatConvert_SeqSBAIJ_SeqSBSTRM",
174:                                       MatConvert_SeqSBAIJ_SeqSBSTRM);
175:   return(0);
176: }
177: EXTERN_C_END
178: /*=========================================================*/
179: /*=========================================================*/
182: PetscErrorCode MatMult_SeqSBSTRM_4(Mat A,Vec xx,Vec zz)
183: {
184:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
185:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
186:   PetscScalar       zero = 0.0;
187:   PetscScalar       sum1,sum2,sum3,sum4,x1,x2,x3,x4, xr1,xr2,xr3,xr4;
188:   PetscScalar       *x, *xb, *z;
189:   MatScalar         *v1, *v2, *v3, *v4;
190:   PetscErrorCode    ierr;
191:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
192:   PetscInt       nonzerorow=0;
193:   PetscInt slen;

196:   VecSet(zz,zero);
197:   VecGetArray(xx,&x);
198:   VecGetArray(zz,&z);

200:   slen = 4*(ai[mbs]-ai[0]);
201:   v1 = sbstrm->as;
202:   v2 = v1 + slen;
203:   v3 = v2 + slen;
204:   v4 = v3 + slen;

206:   xb = x;

208:   for (i=0; i<mbs; i++) {
209:     n  = ai[i+1] - ai[i];
210:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
211:     sum1 = z[4*i]; sum2 = z[4*i+1]; sum3 = z[4*i+2]; sum4 = z[4*i+3];
212:     nonzerorow += (n>0);
213:     jmin = 0;
214:     ib = aj + ai[i];
215:     if (*ib == i){     /* (diag of A)*x */
216:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
217:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
218:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4;
219:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
220:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
221:       jmin++;
222:     }
223: 
224:     for (j=jmin; j<n; j++) {
225:       cval       = ib[j]*4;
226:       z[cval]     += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
227:       z[cval+1]   += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
228:       z[cval+2]   += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
229:       z[cval+3]   += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;

231:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3];
232:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3  + v1[3]*xr4;
233:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3  + v2[3]*xr4;
234:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3  + v3[3]*xr4;
235:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3  + v4[3]*xr4;
236:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
237:     }
238:     z[4*i]   = sum1;
239:     z[4*i+1] = sum2;
240:     z[4*i+2] = sum3;
241:     z[4*i+3] = sum4;
242:     xb +=4;
243: 
244:   }
245:   VecRestoreArray(xx,&x);
246:   VecRestoreArray(zz,&z);
247:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
248:   return(0);
249: }
250: /*=========================================================*/
253: PetscErrorCode MatMult_SeqSBSTRM_5(Mat A,Vec xx,Vec zz)
254: {
255:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
256:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
257:   PetscScalar       zero = 0.0;
258:   PetscScalar       *z = 0;
259:   PetscScalar       *x,*xb;
260:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
261:   PetscScalar       x1, x2, x3, x4, x5;
262:   PetscScalar       xr1, xr2, xr3, xr4, xr5;
263:   PetscScalar       sum1, sum2, sum3, sum4, sum5;
264:   PetscErrorCode    ierr;
265:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
266:   PetscInt       nonzerorow=0;
267:   PetscInt slen;


271:   VecSet(zz,zero);
272:   VecGetArray(xx,&x);
273:   VecGetArray(zz,&z);

275:   slen = 5*(ai[mbs]-ai[0]);

277:   v1 = sbstrm->as;
278:   v2 = v1 + slen;
279:   v3 = v2 + slen;
280:   v4 = v3 + slen;
281:   v5 = v4 + slen;

283:   xb = x;

285:   for (i=0; i<mbs; i++) {
286:     n  = ai[i+1] - ai[i];
287:     nonzerorow += (n>0);

289:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
290:     sum1 = z[5*i];
291:     sum2 = z[5*i+1];
292:     sum3 = z[5*i+2];
293:     sum4 = z[5*i+3];
294:     sum5 = z[5*i+4];
295:     jmin = 0;
296:     ib = aj + ai[i];
297:     if (*ib == i){     /* (diag of A)*x */
298:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
299:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
300:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
301:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v4[4]*x5;
302:       sum5 += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;
303:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
304:       jmin++;
305:     }
306: 
307:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
308:     PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
309:     PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310:     PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311:     PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
312:     PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */

314:     for (j=jmin; j<n; j++) {
315:       cval       = ib[j]*5;
316:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
317:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
318:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
319:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
320:       z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

322:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3]; xr5 = x[cval+4];
323:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3 + v1[3]*xr4 + v1[4]*xr5;
324:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3 + v2[3]*xr4 + v2[4]*xr5;
325:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3 + v3[3]*xr4 + v3[4]*xr5;
326:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3 + v4[3]*xr4 + v4[4]*xr5;
327:       sum5 += v5[0]*xr1 + v5[1]*xr2 + v5[2]*xr3 + v5[3]*xr4 + v5[4]*xr5;
328:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
329:     }
330:     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3; z[5*i+3] = sum4; z[5*i+4] = sum5;
331:     xb += 5;
332:   }
333:   VecRestoreArray(xx,&x);
334:   VecRestoreArray(zz,&z);
335:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
336:   return(0);
337: }
338: /*=========================================================*/
341: PetscErrorCode MatMultAdd_SeqSBSTRM_4(Mat A,Vec xx,Vec yy,Vec zz)
342: {

344:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
345:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
346:   PetscScalar       sum1,sum2,sum3,sum4,x1,x2,x3,x4, xr1,xr2,xr3,xr4;
347:   PetscScalar       *x,*z, *xb;
348:   MatScalar         *v1, *v2, *v3, *v4;
349:   PetscErrorCode    ierr;
350:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
351:   PetscInt       nonzerorow=0;
352:   PetscInt slen;

355:   VecCopy(yy,zz);
356:   VecGetArray(xx,&x);
357:   VecGetArray(zz,&z);

359:   slen = 4*(ai[mbs]-ai[0]);
360:   v1 = sbstrm->as;
361:   v2 = v1 + slen;
362:   v3 = v2 + slen;
363:   v4 = v3 + slen;

365:   xb = x;

367:   for (i=0; i<mbs; i++) {
368:     n  = ai[i+1] - ai[i];
369:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];  xb += 4;
370:     sum1 = z[4*i];
371:     sum2 = z[4*i + 1];
372:     sum3 = z[4*i + 2];
373:     sum4 = z[4*i + 3];
374:     nonzerorow += (n>0);
375:     jmin = 0;
376:     ib = aj + ai[i];
377:     if (*ib == i){     /* (diag of A)*x */
378:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
379:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
380:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4;
381:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
382:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
383:       jmin++;
384:     }
385: 
386:     for (j=jmin; j<n; j++) {
387:       cval       = ib[j]*4;
388:       z[cval]     += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
389:       z[cval+1]   += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
390:       z[cval+2]   += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
391:       z[cval+3]   += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;

393:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3];
394:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3  + v1[3]*xr4;
395:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3  + v2[3]*xr4;
396:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3  + v3[3]*xr4;
397:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3  + v4[3]*xr4;
398:       v1 += 4; v2 += 4; v3 += 4; v4 += 4;
399:     }
400:     z[4*i]   = sum1;
401:     z[4*i+1] = sum2;
402:     z[4*i+2] = sum3;
403:     z[4*i+3] = sum4;
404: 
405:   }
406:   VecRestoreArray(xx,&x);
407:   VecRestoreArray(zz,&z);
408:    PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
409:   return(0);
410: }

412: /*=========================================================*/
415: PetscErrorCode MatMultAdd_SeqSBSTRM_5(Mat A,Vec xx,Vec yy,Vec zz)
416: {
417:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
418:   Mat_SeqSBSTRM      *sbstrm = (Mat_SeqSBSTRM *)A->spptr;
419:   PetscScalar       *x,*xb, *z;
420:   MatScalar         *v1, *v2, *v3, *v4, *v5;
421:   PetscScalar       x1, x2, x3, x4, x5;
422:   PetscScalar       xr1, xr2, xr3, xr4, xr5;
423:   PetscScalar       sum1, sum2, sum3, sum4, sum5;
424:   PetscErrorCode    ierr;
425:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
426:   PetscInt       nonzerorow=0;
427:   PetscInt slen;

430:   VecCopy(yy,zz);
431:   VecGetArray(xx,&x);
432:   VecGetArray(zz,&z);


435:   slen = 5*(ai[mbs]-ai[0]);
436:   v1 = sbstrm->as;
437:   v2 = v1 + slen;
438:   v3 = v2 + slen;
439:   v4 = v3 + slen;
440:   v5 = v4 + slen;

442:   xb = x;

444:   for (i=0; i<mbs; i++) {
445:     n  = ai[i+1] - ai[i];
446:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; xb += 5;
447:     sum1 = z[5*i];
448:     sum2 = z[5*i+1];
449:     sum3 = z[5*i+2];
450:     sum4 = z[5*i+3];
451:     sum5 = z[5*i+4];
452:     nonzerorow += (n>0);
453:     jmin = 0;
454:     ib = aj + ai[i];
455:     if (*ib == i){     /* (diag of A)*x, only upper triangular part is used */
456:       sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
457:       sum2 += v1[1]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
458:       sum3 += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
459:       sum4 += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v4[4]*x5;
460:       sum5 += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;
461:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
462:       jmin++;
463:     }
464: 
465:     for (j=jmin; j<n; j++) {
466:       cval       = ib[j]*5;
467:       z[cval]   += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
468:       z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
469:       z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
470:       z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
471:       z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;

473:       xr1 = x[cval]; xr2 = x[cval+1]; xr3 = x[cval+2]; xr4 = x[cval+3]; xr5 = x[cval+4];
474:       sum1 += v1[0]*xr1 + v1[1]*xr2 + v1[2]*xr3 + v1[3]*xr4 + v1[4]*xr5;
475:       sum2 += v2[0]*xr1 + v2[1]*xr2 + v2[2]*xr3 + v2[3]*xr4 + v2[4]*xr5;
476:       sum3 += v3[0]*xr1 + v3[1]*xr2 + v3[2]*xr3 + v3[3]*xr4 + v3[4]*xr5;
477:       sum4 += v4[0]*xr1 + v4[1]*xr2 + v4[2]*xr3 + v4[3]*xr4 + v4[4]*xr5;
478:       sum5 += v5[0]*xr1 + v5[1]*xr2 + v5[2]*xr3 + v5[3]*xr4 + v5[4]*xr5;
479:       v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
480:     }
481:     z[5*i]   = sum1;
482:     z[5*i+1] = sum2;
483:     z[5*i+2] = sum3;
484:     z[5*i+3] = sum4;
485:     z[5*i+4] = sum5;
486:   }
487:   VecRestoreArray(xx,&x);
488:   VecRestoreArray(zz,&z);
489:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
490:   return(0);
491: }

493: /*=========================================================*/
496: PetscErrorCode SeqSBSTRM_create_sbstrm(Mat A)
497: {
498:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *) A->data;
499:   Mat_SeqSBSTRM   *sbstrm = (Mat_SeqSBSTRM*) A->spptr;
500:   PetscInt       MROW = a->mbs, bs = A->rmap->bs;
501:   PetscInt       *ai = a->i;
502:   PetscInt       i,j,k;
503:   MatScalar      *aa = a->a;
505:   PetscInt  bs2, rbs,  cbs, blen, slen;
506:   PetscScalar **asp;

509:   sbstrm->rbs = sbstrm->cbs = bs;

511:   rbs = cbs = bs;
512:   bs2 = rbs*cbs;
513:   blen = ai[MROW]-ai[0];
514:   slen = blen*cbs;

516:   PetscMalloc(bs2*blen*sizeof(PetscScalar), &sbstrm->as);

518:   PetscMalloc(rbs*sizeof(PetscScalar *), &asp);
519: 
520:   for(i=0;i<rbs;i++) asp[i] = sbstrm->as + i*slen;
521: 
522:   for (k=0; k<blen; k++) {
523:     for (j=0; j<cbs; j++)
524:     for (i=0; i<rbs; i++)
525:         asp[i][k*cbs+j] = aa[k*bs2+j*rbs+i];
526:   }

528:   /* PetscFree(a->a);  */

530:   switch (bs){
531:     case 4:
532:        A->ops->mult          = MatMult_SeqSBSTRM_4;
533:        A->ops->multadd       = MatMultAdd_SeqSBSTRM_4;
534:        /** A->ops->sor     = MatSOR_SeqSBSTRM_4;  **/
535:        break;
536:     case 5:
537:        A->ops->mult          = MatMult_SeqSBSTRM_5;
538:        A->ops->multadd       = MatMultAdd_SeqSBSTRM_5;
539:        /** A->ops->sor     = MatSOR_SeqSBSTRM_5;  **/
540:        break;
541:     default:
542:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
543:   }
544:   return(0);
545: }
546: /*=========================================================*/
547: EXTERN_C_BEGIN
550: PetscErrorCode MatSeqSBSTRMTransform(Mat A)
551: {
553:     SeqSBSTRM_convert_sbstrm(A);
554:   return(0);
555: }
556: EXTERN_C_END