Actual source code: baij2.c

petsc-3.4.5 2014-06-29
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <petsc-private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 10: {
 11:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 13:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 14:   const PetscInt *idx;
 15:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT        table;

 19:   m  = a->mbs;
 20:   ai = a->i;
 21:   aj = a->j;
 22:   bs = A->rmap->bs;

 24:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,&table);
 27:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 28:   PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);

 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n; ++j) {
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(&is[i]);

 48:     k = 0;
 49:     for (j=0; j<ov; j++) { /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end; l++) {
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 66:   }
 67:   PetscBTDestroy(&table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return(0);
 71: }

 75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 76: {
 77:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 79:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 80:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 81:   const PetscInt *irow,*icol;
 82:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscBool      flag,sorted;

 89:   ISSorted(iscol,&sorted);
 90:   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 92:   ISGetIndices(isrow,&irow);
 93:   ISGetIndices(iscol,&icol);
 94:   ISGetLocalSize(isrow,&nrows);
 95:   ISGetLocalSize(iscol,&ncols);

 97:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 98:   ssmap = smap;
 99:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
101:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102:   /* determine lens of each row */
103:   for (i=0; i<nrows; i++) {
104:     kstart  = ai[irow[i]];
105:     kend    = kstart + a->ilen[irow[i]];
106:     lens[i] = 0;
107:     for (k=kstart; k<kend; k++) {
108:       if (ssmap[aj[k]]) lens[i]++;
109:     }
110:   }
111:   /* Create and fill new matrix */
112:   if (scall == MAT_REUSE_MATRIX) {
113:     c = (Mat_SeqBAIJ*)((*B)->data);

115:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
117:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
119:     C    = *B;
120:   } else {
121:     MatCreate(PetscObjectComm((PetscObject)A),&C);
122:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
123:     MatSetType(C,((PetscObject)A)->type_name);
124:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
125:   }
126:   c = (Mat_SeqBAIJ*)(C->data);
127:   for (i=0; i<nrows; i++) {
128:     row      = irow[i];
129:     kstart   = ai[row];
130:     kend     = kstart + a->ilen[row];
131:     mat_i    = c->i[i];
132:     mat_j    = c->j + mat_i;
133:     mat_a    = c->a + mat_i*bs2;
134:     mat_ilen = c->ilen + i;
135:     for (k=kstart; k<kend; k++) {
136:       if ((tcol=ssmap[a->j[k]])) {
137:         *mat_j++ = tcol - 1;
138:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
139:         mat_a   += bs2;
140:         (*mat_ilen)++;
141:       }
142:     }
143:   }

145:   /* Free work space */
146:   ISRestoreIndices(iscol,&icol);
147:   PetscFree(smap);
148:   PetscFree(lens);
149:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
150:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

152:   ISRestoreIndices(isrow,&irow);
153:   *B   = C;
154:   return(0);
155: }

159: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
160: {
161:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
162:   IS             is1,is2;
164:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
165:   const PetscInt *irow,*icol;

168:   ISGetIndices(isrow,&irow);
169:   ISGetIndices(iscol,&icol);
170:   ISGetLocalSize(isrow,&nrows);
171:   ISGetLocalSize(iscol,&ncols);

173:   /* Verify if the indices corespond to each element in a block
174:    and form the IS with compressed IS */
175:   PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
176:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
177:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178:   count = 0;
179:   for (i=0; i<a->mbs; i++) {
180:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
181:     if (vary[i]==bs) iary[count++] = i;
182:   }
183:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

185:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
186:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187:   count = 0;
188:   for (i=0; i<a->mbs; i++) {
189:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
190:     if (vary[i]==bs) iary[count++] = i;
191:   }
192:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
193:   ISRestoreIndices(isrow,&irow);
194:   ISRestoreIndices(iscol,&icol);
195:   PetscFree2(vary,iary);

197:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
198:   ISDestroy(&is1);
199:   ISDestroy(&is2);
200:   return(0);
201: }

205: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206: {
208:   PetscInt       i;

211:   if (scall == MAT_INITIAL_MATRIX) {
212:     PetscMalloc((n+1)*sizeof(Mat),B);
213:   }

215:   for (i=0; i<n; i++) {
216:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
217:   }
218:   return(0);
219: }


222: /* -------------------------------------------------------*/
223: /* Should check that shapes of vectors and matrices match */
224: /* -------------------------------------------------------*/

228: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229: {
230:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
231:   PetscScalar       *z,sum;
232:   const PetscScalar *x;
233:   const MatScalar   *v;
234:   PetscErrorCode    ierr;
235:   PetscInt          mbs,i,n,nonzerorow=0;
236:   const PetscInt    *idx,*ii,*ridx=NULL;
237:   PetscBool         usecprow=a->compressedrow.use;

240:   VecGetArrayRead(xx,&x);
241:   VecGetArray(zz,&z);

243:   if (usecprow) {
244:     mbs  = a->compressedrow.nrows;
245:     ii   = a->compressedrow.i;
246:     ridx = a->compressedrow.rindex;
247:     PetscMemzero(z,mbs*sizeof(PetscScalar));
248:   } else {
249:     mbs = a->mbs;
250:     ii  = a->i;
251:   }

253:   for (i=0; i<mbs; i++) {
254:     n   = ii[1] - ii[0];
255:     v   = a->a + ii[0];
256:     idx = a->j + ii[0];
257:     ii++;
258:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
259:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
260:     sum = 0.0;
261:     PetscSparseDensePlusDot(sum,x,v,idx,n);
262:     if (usecprow) {
263:       z[ridx[i]] = sum;
264:     } else {
265:       nonzerorow += (n>0);
266:       z[i]        = sum;
267:     }
268:   }
269:   VecRestoreArrayRead(xx,&x);
270:   VecRestoreArray(zz,&z);
271:   PetscLogFlops(2.0*a->nz - nonzerorow);
272:   return(0);
273: }

277: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
280:   PetscScalar       *z = 0,sum1,sum2,*zarray;
281:   const PetscScalar *x,*xb;
282:   PetscScalar       x1,x2;
283:   const MatScalar   *v;
284:   PetscErrorCode    ierr;
285:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
286:   PetscBool         usecprow=a->compressedrow.use;

289:   VecGetArrayRead(xx,&x);
290:   VecGetArray(zz,&zarray);

292:   idx = a->j;
293:   v   = a->a;
294:   if (usecprow) {
295:     mbs  = a->compressedrow.nrows;
296:     ii   = a->compressedrow.i;
297:     ridx = a->compressedrow.rindex;
298:   } else {
299:     mbs = a->mbs;
300:     ii  = a->i;
301:     z   = zarray;
302:   }

304:   for (i=0; i<mbs; i++) {
305:     n           = ii[1] - ii[0]; ii++;
306:     sum1        = 0.0; sum2 = 0.0;
307:     nonzerorow += (n>0);
308:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
309:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310:     for (j=0; j<n; j++) {
311:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312:       sum1 += v[0]*x1 + v[2]*x2;
313:       sum2 += v[1]*x1 + v[3]*x2;
314:       v    += 4;
315:     }
316:     if (usecprow) z = zarray + 2*ridx[i];
317:     z[0] = sum1; z[1] = sum2;
318:     if (!usecprow) z += 2;
319:   }
320:   VecRestoreArrayRead(xx,&x);
321:   VecRestoreArray(zz,&zarray);
322:   PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);
323:   return(0);
324: }

328: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329: {
330:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
331:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332:   const PetscScalar *x,*xb;
333:   const MatScalar   *v;
334:   PetscErrorCode    ierr;
335:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
336:   PetscBool         usecprow=a->compressedrow.use;


339: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340: #pragma disjoint(*v,*z,*xb)
341: #endif

344:   VecGetArrayRead(xx,&x);
345:   VecGetArray(zz,&zarray);

347:   idx = a->j;
348:   v   = a->a;
349:   if (usecprow) {
350:     mbs  = a->compressedrow.nrows;
351:     ii   = a->compressedrow.i;
352:     ridx = a->compressedrow.rindex;
353:   } else {
354:     mbs = a->mbs;
355:     ii  = a->i;
356:     z   = zarray;
357:   }

359:   for (i=0; i<mbs; i++) {
360:     n           = ii[1] - ii[0]; ii++;
361:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
362:     nonzerorow += (n>0);
363:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
364:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
365:     for (j=0; j<n; j++) {
366:       xb = x + 3*(*idx++);
367:       x1 = xb[0];
368:       x2 = xb[1];
369:       x3 = xb[2];

371:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
372:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
373:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
374:       v    += 9;
375:     }
376:     if (usecprow) z = zarray + 3*ridx[i];
377:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
378:     if (!usecprow) z += 3;
379:   }
380:   VecRestoreArrayRead(xx,&x);
381:   VecRestoreArray(zz,&zarray);
382:   PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);
383:   return(0);
384: }

388: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
389: {
390:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
391:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
392:   const PetscScalar *x,*xb;
393:   const MatScalar   *v;
394:   PetscErrorCode    ierr;
395:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
396:   PetscBool         usecprow=a->compressedrow.use;

399:   VecGetArrayRead(xx,&x);
400:   VecGetArray(zz,&zarray);

402:   idx = a->j;
403:   v   = a->a;
404:   if (usecprow) {
405:     mbs  = a->compressedrow.nrows;
406:     ii   = a->compressedrow.i;
407:     ridx = a->compressedrow.rindex;
408:   } else {
409:     mbs = a->mbs;
410:     ii  = a->i;
411:     z   = zarray;
412:   }

414:   for (i=0; i<mbs; i++) {
415:     n = ii[1] - ii[0];
416:     ii++;
417:     sum1 = 0.0;
418:     sum2 = 0.0;
419:     sum3 = 0.0;
420:     sum4 = 0.0;

422:     nonzerorow += (n>0);
423:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
424:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
425:     for (j=0; j<n; j++) {
426:       xb    = x + 4*(*idx++);
427:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
428:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
429:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
430:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
431:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
432:       v    += 16;
433:     }
434:     if (usecprow) z = zarray + 4*ridx[i];
435:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
436:     if (!usecprow) z += 4;
437:   }
438:   VecRestoreArrayRead(xx,&x);
439:   VecRestoreArray(zz,&zarray);
440:   PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);
441:   return(0);
442: }

446: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
447: {
448:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
449:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
450:   const PetscScalar *xb,*x;
451:   const MatScalar   *v;
452:   PetscErrorCode    ierr;
453:   const PetscInt    *idx,*ii,*ridx=NULL;
454:   PetscInt          mbs,i,j,n,nonzerorow=0;
455:   PetscBool         usecprow=a->compressedrow.use;

458:   VecGetArrayRead(xx,&x);
459:   VecGetArray(zz,&zarray);

461:   idx = a->j;
462:   v   = a->a;
463:   if (usecprow) {
464:     mbs  = a->compressedrow.nrows;
465:     ii   = a->compressedrow.i;
466:     ridx = a->compressedrow.rindex;
467:   } else {
468:     mbs = a->mbs;
469:     ii  = a->i;
470:     z   = zarray;
471:   }

473:   for (i=0; i<mbs; i++) {
474:     n           = ii[1] - ii[0]; ii++;
475:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
476:     nonzerorow += (n>0);
477:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
478:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479:     for (j=0; j<n; j++) {
480:       xb    = x + 5*(*idx++);
481:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
482:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
483:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
484:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
485:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
486:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
487:       v    += 25;
488:     }
489:     if (usecprow) z = zarray + 5*ridx[i];
490:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
491:     if (!usecprow) z += 5;
492:   }
493:   VecRestoreArrayRead(xx,&x);
494:   VecRestoreArray(zz,&zarray);
495:   PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
496:   return(0);
497: }


502: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
503: {
504:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
505:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
506:   const PetscScalar *x,*xb;
507:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
508:   const MatScalar   *v;
509:   PetscErrorCode    ierr;
510:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
511:   PetscBool         usecprow=a->compressedrow.use;

514:   VecGetArrayRead(xx,&x);
515:   VecGetArray(zz,&zarray);

517:   idx = a->j;
518:   v   = a->a;
519:   if (usecprow) {
520:     mbs  = a->compressedrow.nrows;
521:     ii   = a->compressedrow.i;
522:     ridx = a->compressedrow.rindex;
523:   } else {
524:     mbs = a->mbs;
525:     ii  = a->i;
526:     z   = zarray;
527:   }

529:   for (i=0; i<mbs; i++) {
530:     n  = ii[1] - ii[0];
531:     ii++;
532:     sum1 = 0.0;
533:     sum2 = 0.0;
534:     sum3 = 0.0;
535:     sum4 = 0.0;
536:     sum5 = 0.0;
537:     sum6 = 0.0;

539:     nonzerorow += (n>0);
540:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
541:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
542:     for (j=0; j<n; j++) {
543:       xb    = x + 6*(*idx++);
544:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
545:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
546:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
547:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
548:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
549:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
550:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
551:       v    += 36;
552:     }
553:     if (usecprow) z = zarray + 6*ridx[i];
554:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
555:     if (!usecprow) z += 6;
556:   }

558:   VecRestoreArrayRead(xx,&x);
559:   VecRestoreArray(zz,&zarray);
560:   PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);
561:   return(0);
562: }

566: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
567: {
568:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
569:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
570:   const PetscScalar *x,*xb;
571:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
572:   const MatScalar   *v;
573:   PetscErrorCode    ierr;
574:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
575:   PetscBool         usecprow=a->compressedrow.use;

578:   VecGetArrayRead(xx,&x);
579:   VecGetArray(zz,&zarray);

581:   idx = a->j;
582:   v   = a->a;
583:   if (usecprow) {
584:     mbs  = a->compressedrow.nrows;
585:     ii   = a->compressedrow.i;
586:     ridx = a->compressedrow.rindex;
587:   } else {
588:     mbs = a->mbs;
589:     ii  = a->i;
590:     z   = zarray;
591:   }

593:   for (i=0; i<mbs; i++) {
594:     n  = ii[1] - ii[0];
595:     ii++;
596:     sum1 = 0.0;
597:     sum2 = 0.0;
598:     sum3 = 0.0;
599:     sum4 = 0.0;
600:     sum5 = 0.0;
601:     sum6 = 0.0;
602:     sum7 = 0.0;

604:     nonzerorow += (n>0);
605:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
606:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
607:     for (j=0; j<n; j++) {
608:       xb    = x + 7*(*idx++);
609:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
610:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
611:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
612:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
613:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
614:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
615:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
616:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
617:       v    += 49;
618:     }
619:     if (usecprow) z = zarray + 7*ridx[i];
620:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
621:     if (!usecprow) z += 7;
622:   }

624:   VecRestoreArrayRead(xx,&x);
625:   VecRestoreArray(zz,&zarray);
626:   PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);
627:   return(0);
628: }

630: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
631: /* Default MatMult for block size 15 */

635: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
636: {
637:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
638:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
639:   const PetscScalar *x,*xb;
640:   PetscScalar       *zarray,xv;
641:   const MatScalar   *v;
642:   PetscErrorCode    ierr;
643:   const PetscInt    *ii,*ij=a->j,*idx;
644:   PetscInt          mbs,i,j,k,n,*ridx=NULL,nonzerorow=0;
645:   PetscBool         usecprow=a->compressedrow.use;

648:   VecGetArrayRead(xx,&x);
649:   VecGetArray(zz,&zarray);

651:   v = a->a;
652:   if (usecprow) {
653:     mbs  = a->compressedrow.nrows;
654:     ii   = a->compressedrow.i;
655:     ridx = a->compressedrow.rindex;
656:   } else {
657:     mbs = a->mbs;
658:     ii  = a->i;
659:     z   = zarray;
660:   }

662:   for (i=0; i<mbs; i++) {
663:     n    = ii[i+1] - ii[i];
664:     idx  = ij + ii[i];
665:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
666:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

668:     nonzerorow += (n>0);
669:     for (j=0; j<n; j++) {
670:       xb = x + 15*(idx[j]);

672:       for (k=0; k<15; k++) {
673:         xv     =  xb[k];
674:         sum1  += v[0]*xv;
675:         sum2  += v[1]*xv;
676:         sum3  += v[2]*xv;
677:         sum4  += v[3]*xv;
678:         sum5  += v[4]*xv;
679:         sum6  += v[5]*xv;
680:         sum7  += v[6]*xv;
681:         sum8  += v[7]*xv;
682:         sum9  += v[8]*xv;
683:         sum10 += v[9]*xv;
684:         sum11 += v[10]*xv;
685:         sum12 += v[11]*xv;
686:         sum13 += v[12]*xv;
687:         sum14 += v[13]*xv;
688:         sum15 += v[14]*xv;
689:         v     += 15;
690:       }
691:     }
692:     if (usecprow) z = zarray + 15*ridx[i];
693:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
694:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

696:     if (!usecprow) z += 15;
697:   }

699:   VecRestoreArrayRead(xx,&x);
700:   VecRestoreArray(zz,&zarray);
701:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
702:   return(0);
703: }

705: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
708: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
709: {
710:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
711:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
712:   const PetscScalar *x,*xb;
713:   PetscScalar       x1,x2,x3,x4,*zarray;
714:   const MatScalar   *v;
715:   PetscErrorCode    ierr;
716:   const PetscInt    *ii,*ij=a->j,*idx;
717:   PetscInt          mbs,i,j,n,*ridx=NULL,nonzerorow=0;
718:   PetscBool         usecprow=a->compressedrow.use;

721:   VecGetArrayRead(xx,&x);
722:   VecGetArray(zz,&zarray);

724:   v = a->a;
725:   if (usecprow) {
726:     mbs  = a->compressedrow.nrows;
727:     ii   = a->compressedrow.i;
728:     ridx = a->compressedrow.rindex;
729:   } else {
730:     mbs = a->mbs;
731:     ii  = a->i;
732:     z   = zarray;
733:   }

735:   for (i=0; i<mbs; i++) {
736:     n    = ii[i+1] - ii[i];
737:     idx  = ij + ii[i];
738:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
739:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

741:     nonzerorow += (n>0);
742:     for (j=0; j<n; j++) {
743:       xb = x + 15*(idx[j]);
744:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

746:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
747:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
748:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
749:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
750:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
751:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
752:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
753:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
754:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
755:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
756:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
757:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
758:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
759:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
760:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

762:       v += 60;

764:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

766:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
767:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
768:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
769:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
770:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
771:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
772:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
773:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
774:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
775:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
776:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
777:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
778:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
779:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
780:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
781:       v     += 60;

783:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
784:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
785:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
786:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
787:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
788:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
789:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
790:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
791:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
792:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
793:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
794:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
795:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
796:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
797:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
798:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
799:       v     += 60;

801:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
802:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
803:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
804:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
805:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
806:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
807:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
808:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
809:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
810:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
811:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
812:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
813:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
814:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
815:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
816:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
817:       v     += 45;
818:     }
819:     if (usecprow) z = zarray + 15*ridx[i];
820:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
821:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

823:     if (!usecprow) z += 15;
824:   }

826:   VecRestoreArrayRead(xx,&x);
827:   VecRestoreArray(zz,&zarray);
828:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
829:   return(0);
830: }

832: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
835: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
836: {
837:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
838:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
839:   const PetscScalar *x,*xb;
840:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
841:   const MatScalar   *v;
842:   PetscErrorCode    ierr;
843:   const PetscInt    *ii,*ij=a->j,*idx;
844:   PetscInt          mbs,i,j,n,*ridx=NULL,nonzerorow=0;
845:   PetscBool         usecprow=a->compressedrow.use;

848:   VecGetArrayRead(xx,&x);
849:   VecGetArray(zz,&zarray);

851:   v = a->a;
852:   if (usecprow) {
853:     mbs  = a->compressedrow.nrows;
854:     ii   = a->compressedrow.i;
855:     ridx = a->compressedrow.rindex;
856:   } else {
857:     mbs = a->mbs;
858:     ii  = a->i;
859:     z   = zarray;
860:   }

862:   for (i=0; i<mbs; i++) {
863:     n    = ii[i+1] - ii[i];
864:     idx  = ij + ii[i];
865:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
866:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

868:     nonzerorow += (n>0);
869:     for (j=0; j<n; j++) {
870:       xb = x + 15*(idx[j]);
871:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
872:       x8 = xb[7];

874:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
875:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
876:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
877:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
878:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
879:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
880:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
881:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
882:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
883:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
884:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
885:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
886:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
887:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
888:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
889:       v     += 120;

891:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

893:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
894:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
895:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
896:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
897:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
898:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
899:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
900:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
901:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
902:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
903:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
904:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
905:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
906:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
907:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
908:       v     += 105;
909:     }
910:     if (usecprow) z = zarray + 15*ridx[i];
911:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
912:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

914:     if (!usecprow) z += 15;
915:   }

917:   VecRestoreArrayRead(xx,&x);
918:   VecRestoreArray(zz,&zarray);
919:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
920:   return(0);
921: }

923: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

927: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
928: {
929:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931:   const PetscScalar *x,*xb;
932:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
933:   const MatScalar   *v;
934:   PetscErrorCode    ierr;
935:   const PetscInt    *ii,*ij=a->j,*idx;
936:   PetscInt          mbs,i,j,n,*ridx=NULL,nonzerorow=0;
937:   PetscBool         usecprow=a->compressedrow.use;

940:   VecGetArrayRead(xx,&x);
941:   VecGetArray(zz,&zarray);

943:   v = a->a;
944:   if (usecprow) {
945:     mbs  = a->compressedrow.nrows;
946:     ii   = a->compressedrow.i;
947:     ridx = a->compressedrow.rindex;
948:   } else {
949:     mbs = a->mbs;
950:     ii  = a->i;
951:     z   = zarray;
952:   }

954:   for (i=0; i<mbs; i++) {
955:     n    = ii[i+1] - ii[i];
956:     idx  = ij + ii[i];
957:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

960:     nonzerorow += (n>0);
961:     for (j=0; j<n; j++) {
962:       xb = x + 15*(idx[j]);
963:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

966:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
967:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
968:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
969:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
970:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
971:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
972:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
973:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
974:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
975:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
976:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
977:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
978:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
979:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
980:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
981:       v     += 225;
982:     }
983:     if (usecprow) z = zarray + 15*ridx[i];
984:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
985:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

987:     if (!usecprow) z += 15;
988:   }

990:   VecRestoreArrayRead(xx,&x);
991:   VecRestoreArray(zz,&zarray);
992:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
993:   return(0);
994: }


997: /*
998:     This will not work with MatScalar == float because it calls the BLAS
999: */
1002: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1003: {
1004:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1005:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1006:   MatScalar      *v;
1008:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1009:   PetscInt       ncols,k,*ridx=NULL,nonzerorow=0;
1010:   PetscBool      usecprow=a->compressedrow.use;

1013:   VecGetArray(xx,&x);
1014:   VecGetArray(zz,&zarray);

1016:   idx = a->j;
1017:   v   = a->a;
1018:   if (usecprow) {
1019:     mbs  = a->compressedrow.nrows;
1020:     ii   = a->compressedrow.i;
1021:     ridx = a->compressedrow.rindex;
1022:   } else {
1023:     mbs = a->mbs;
1024:     ii  = a->i;
1025:     z   = zarray;
1026:   }

1028:   if (!a->mult_work) {
1029:     k    = PetscMax(A->rmap->n,A->cmap->n);
1030:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1031:   }
1032:   work = a->mult_work;
1033:   for (i=0; i<mbs; i++) {
1034:     n           = ii[1] - ii[0]; ii++;
1035:     ncols       = n*bs;
1036:     workt       = work;
1037:     nonzerorow += (n>0);
1038:     for (j=0; j<n; j++) {
1039:       xb = x + bs*(*idx++);
1040:       for (k=0; k<bs; k++) workt[k] = xb[k];
1041:       workt += bs;
1042:     }
1043:     if (usecprow) z = zarray + bs*ridx[i];
1044:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1045:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1046:     v += n*bs2;
1047:     if (!usecprow) z += bs;
1048:   }
1049:   VecRestoreArray(xx,&x);
1050:   VecRestoreArray(zz,&zarray);
1051:   PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);
1052:   return(0);
1053: }

1057: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1058: {
1059:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1060:   const PetscScalar *x;
1061:   PetscScalar       *y,*z,sum;
1062:   const MatScalar   *v;
1063:   PetscErrorCode    ierr;
1064:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL,nonzerorow=0;
1065:   const PetscInt    *idx,*ii;
1066:   PetscBool         usecprow=a->compressedrow.use;

1069:   VecGetArrayRead(xx,&x);
1070:   VecGetArray(yy,&y);
1071:   if (zz != yy) {
1072:     VecGetArray(zz,&z);
1073:   } else {
1074:     z = y;
1075:   }

1077:   idx = a->j;
1078:   v   = a->a;
1079:   if (usecprow) {
1080:     if (zz != yy) {
1081:       PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1082:     }
1083:     mbs  = a->compressedrow.nrows;
1084:     ii   = a->compressedrow.i;
1085:     ridx = a->compressedrow.rindex;
1086:   } else {
1087:     ii = a->i;
1088:   }

1090:   for (i=0; i<mbs; i++) {
1091:     n = ii[1] - ii[0];
1092:     ii++;
1093:     if (!usecprow) {
1094:       nonzerorow += (n>0);
1095:       sum         = y[i];
1096:     } else {
1097:       sum = y[ridx[i]];
1098:     }
1099:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1100:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1101:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1102:     v   += n;
1103:     idx += n;
1104:     if (usecprow) {
1105:       z[ridx[i]] = sum;
1106:     } else {
1107:       z[i] = sum;
1108:     }
1109:   }
1110:   VecRestoreArrayRead(xx,&x);
1111:   VecRestoreArray(yy,&y);
1112:   if (zz != yy) {
1113:     VecRestoreArray(zz,&z);
1114:   }
1115:   PetscLogFlops(2.0*a->nz - nonzerorow);
1116:   return(0);
1117: }

1121: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122: {
1123:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1124:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1125:   PetscScalar    x1,x2,*yarray,*zarray;
1126:   MatScalar      *v;
1128:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1129:   PetscBool      usecprow=a->compressedrow.use;

1132:   VecGetArray(xx,&x);
1133:   VecGetArray(yy,&yarray);
1134:   if (zz != yy) {
1135:     VecGetArray(zz,&zarray);
1136:   } else {
1137:     zarray = yarray;
1138:   }

1140:   idx = a->j;
1141:   v   = a->a;
1142:   if (usecprow) {
1143:     if (zz != yy) {
1144:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1145:     }
1146:     mbs  = a->compressedrow.nrows;
1147:     ii   = a->compressedrow.i;
1148:     ridx = a->compressedrow.rindex;
1149:     if (zz != yy) {
1150:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1151:     }
1152:   } else {
1153:     ii = a->i;
1154:     y  = yarray;
1155:     z  = zarray;
1156:   }

1158:   for (i=0; i<mbs; i++) {
1159:     n = ii[1] - ii[0]; ii++;
1160:     if (usecprow) {
1161:       z = zarray + 2*ridx[i];
1162:       y = yarray + 2*ridx[i];
1163:     }
1164:     sum1 = y[0]; sum2 = y[1];
1165:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1166:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1167:     for (j=0; j<n; j++) {
1168:       xb = x + 2*(*idx++);
1169:       x1 = xb[0];
1170:       x2 = xb[1];

1172:       sum1 += v[0]*x1 + v[2]*x2;
1173:       sum2 += v[1]*x1 + v[3]*x2;
1174:       v    += 4;
1175:     }
1176:     z[0] = sum1; z[1] = sum2;
1177:     if (!usecprow) {
1178:       z += 2; y += 2;
1179:     }
1180:   }
1181:   VecRestoreArray(xx,&x);
1182:   VecRestoreArray(yy,&yarray);
1183:   if (zz != yy) {
1184:     VecRestoreArray(zz,&zarray);
1185:   }
1186:   PetscLogFlops(4.0*a->nz);
1187:   return(0);
1188: }

1192: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1193: {
1194:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1195:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1196:   MatScalar      *v;
1198:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1199:   PetscBool      usecprow=a->compressedrow.use;

1202:   VecGetArray(xx,&x);
1203:   VecGetArray(yy,&yarray);
1204:   if (zz != yy) {
1205:     VecGetArray(zz,&zarray);
1206:   } else {
1207:     zarray = yarray;
1208:   }

1210:   idx = a->j;
1211:   v   = a->a;
1212:   if (usecprow) {
1213:     if (zz != yy) {
1214:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1215:     }
1216:     mbs  = a->compressedrow.nrows;
1217:     ii   = a->compressedrow.i;
1218:     ridx = a->compressedrow.rindex;
1219:   } else {
1220:     ii = a->i;
1221:     y  = yarray;
1222:     z  = zarray;
1223:   }

1225:   for (i=0; i<mbs; i++) {
1226:     n = ii[1] - ii[0]; ii++;
1227:     if (usecprow) {
1228:       z = zarray + 3*ridx[i];
1229:       y = yarray + 3*ridx[i];
1230:     }
1231:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1232:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1233:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1234:     for (j=0; j<n; j++) {
1235:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1236:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1237:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1238:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1239:       v    += 9;
1240:     }
1241:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1242:     if (!usecprow) {
1243:       z += 3; y += 3;
1244:     }
1245:   }
1246:   VecRestoreArray(xx,&x);
1247:   VecRestoreArray(yy,&yarray);
1248:   if (zz != yy) {
1249:     VecRestoreArray(zz,&zarray);
1250:   }
1251:   PetscLogFlops(18.0*a->nz);
1252:   return(0);
1253: }

1257: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1258: {
1259:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1260:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1261:   MatScalar      *v;
1263:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1264:   PetscBool      usecprow=a->compressedrow.use;

1267:   VecGetArray(xx,&x);
1268:   VecGetArray(yy,&yarray);
1269:   if (zz != yy) {
1270:     VecGetArray(zz,&zarray);
1271:   } else {
1272:     zarray = yarray;
1273:   }

1275:   idx = a->j;
1276:   v   = a->a;
1277:   if (usecprow) {
1278:     if (zz != yy) {
1279:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1280:     }
1281:     mbs  = a->compressedrow.nrows;
1282:     ii   = a->compressedrow.i;
1283:     ridx = a->compressedrow.rindex;
1284:   } else {
1285:     ii = a->i;
1286:     y  = yarray;
1287:     z  = zarray;
1288:   }

1290:   for (i=0; i<mbs; i++) {
1291:     n = ii[1] - ii[0]; ii++;
1292:     if (usecprow) {
1293:       z = zarray + 4*ridx[i];
1294:       y = yarray + 4*ridx[i];
1295:     }
1296:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1297:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1298:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299:     for (j=0; j<n; j++) {
1300:       xb    = x + 4*(*idx++);
1301:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1302:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1303:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1304:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1305:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1306:       v    += 16;
1307:     }
1308:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1309:     if (!usecprow) {
1310:       z += 4; y += 4;
1311:     }
1312:   }
1313:   VecRestoreArray(xx,&x);
1314:   VecRestoreArray(yy,&yarray);
1315:   if (zz != yy) {
1316:     VecRestoreArray(zz,&zarray);
1317:   }
1318:   PetscLogFlops(32.0*a->nz);
1319:   return(0);
1320: }

1324: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1325: {
1326:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1327:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1328:   PetscScalar    *yarray,*zarray;
1329:   MatScalar      *v;
1331:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1332:   PetscBool      usecprow=a->compressedrow.use;

1335:   VecGetArray(xx,&x);
1336:   VecGetArray(yy,&yarray);
1337:   if (zz != yy) {
1338:     VecGetArray(zz,&zarray);
1339:   } else {
1340:     zarray = yarray;
1341:   }

1343:   idx = a->j;
1344:   v   = a->a;
1345:   if (usecprow) {
1346:     if (zz != yy) {
1347:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1348:     }
1349:     mbs  = a->compressedrow.nrows;
1350:     ii   = a->compressedrow.i;
1351:     ridx = a->compressedrow.rindex;
1352:   } else {
1353:     ii = a->i;
1354:     y  = yarray;
1355:     z  = zarray;
1356:   }

1358:   for (i=0; i<mbs; i++) {
1359:     n = ii[1] - ii[0]; ii++;
1360:     if (usecprow) {
1361:       z = zarray + 5*ridx[i];
1362:       y = yarray + 5*ridx[i];
1363:     }
1364:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1365:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1366:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1367:     for (j=0; j<n; j++) {
1368:       xb    = x + 5*(*idx++);
1369:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1370:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1371:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1372:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1373:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1374:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1375:       v    += 25;
1376:     }
1377:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1378:     if (!usecprow) {
1379:       z += 5; y += 5;
1380:     }
1381:   }
1382:   VecRestoreArray(xx,&x);
1383:   VecRestoreArray(yy,&yarray);
1384:   if (zz != yy) {
1385:     VecRestoreArray(zz,&zarray);
1386:   }
1387:   PetscLogFlops(50.0*a->nz);
1388:   return(0);
1389: }
1392: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1393: {
1394:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1395:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1396:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1397:   MatScalar      *v;
1399:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1400:   PetscBool      usecprow=a->compressedrow.use;

1403:   VecGetArray(xx,&x);
1404:   VecGetArray(yy,&yarray);
1405:   if (zz != yy) {
1406:     VecGetArray(zz,&zarray);
1407:   } else {
1408:     zarray = yarray;
1409:   }

1411:   idx = a->j;
1412:   v   = a->a;
1413:   if (usecprow) {
1414:     if (zz != yy) {
1415:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1416:     }
1417:     mbs  = a->compressedrow.nrows;
1418:     ii   = a->compressedrow.i;
1419:     ridx = a->compressedrow.rindex;
1420:   } else {
1421:     ii = a->i;
1422:     y  = yarray;
1423:     z  = zarray;
1424:   }

1426:   for (i=0; i<mbs; i++) {
1427:     n = ii[1] - ii[0]; ii++;
1428:     if (usecprow) {
1429:       z = zarray + 6*ridx[i];
1430:       y = yarray + 6*ridx[i];
1431:     }
1432:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1433:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1434:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1435:     for (j=0; j<n; j++) {
1436:       xb    = x + 6*(*idx++);
1437:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1438:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1439:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1440:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1441:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1442:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1443:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1444:       v    += 36;
1445:     }
1446:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1447:     if (!usecprow) {
1448:       z += 6; y += 6;
1449:     }
1450:   }
1451:   VecRestoreArray(xx,&x);
1452:   VecRestoreArray(yy,&yarray);
1453:   if (zz != yy) {
1454:     VecRestoreArray(zz,&zarray);
1455:   }
1456:   PetscLogFlops(72.0*a->nz);
1457:   return(0);
1458: }

1462: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1463: {
1464:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1465:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1466:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1467:   MatScalar      *v;
1469:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1470:   PetscBool      usecprow=a->compressedrow.use;

1473:   VecGetArray(xx,&x);
1474:   VecGetArray(yy,&yarray);
1475:   if (zz != yy) {
1476:     VecGetArray(zz,&zarray);
1477:   } else {
1478:     zarray = yarray;
1479:   }

1481:   idx = a->j;
1482:   v   = a->a;
1483:   if (usecprow) {
1484:     if (zz != yy) {
1485:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1486:     }
1487:     mbs  = a->compressedrow.nrows;
1488:     ii   = a->compressedrow.i;
1489:     ridx = a->compressedrow.rindex;
1490:   } else {
1491:     ii = a->i;
1492:     y  = yarray;
1493:     z  = zarray;
1494:   }

1496:   for (i=0; i<mbs; i++) {
1497:     n = ii[1] - ii[0]; ii++;
1498:     if (usecprow) {
1499:       z = zarray + 7*ridx[i];
1500:       y = yarray + 7*ridx[i];
1501:     }
1502:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1503:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1504:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1505:     for (j=0; j<n; j++) {
1506:       xb    = x + 7*(*idx++);
1507:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1508:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1509:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1510:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1511:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1512:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1513:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1514:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1515:       v    += 49;
1516:     }
1517:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1518:     if (!usecprow) {
1519:       z += 7; y += 7;
1520:     }
1521:   }
1522:   VecRestoreArray(xx,&x);
1523:   VecRestoreArray(yy,&yarray);
1524:   if (zz != yy) {
1525:     VecRestoreArray(zz,&zarray);
1526:   }
1527:   PetscLogFlops(98.0*a->nz);
1528:   return(0);
1529: }

1533: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1534: {
1535:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1536:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1537:   MatScalar      *v;
1539:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1540:   PetscInt       ncols,k,*ridx=NULL;
1541:   PetscBool      usecprow=a->compressedrow.use;

1544:   VecCopy(yy,zz);
1545:   VecGetArray(xx,&x);
1546:   VecGetArray(zz,&zarray);

1548:   idx = a->j;
1549:   v   = a->a;
1550:   if (usecprow) {
1551:     mbs  = a->compressedrow.nrows;
1552:     ii   = a->compressedrow.i;
1553:     ridx = a->compressedrow.rindex;
1554:   } else {
1555:     mbs = a->mbs;
1556:     ii  = a->i;
1557:     z   = zarray;
1558:   }

1560:   if (!a->mult_work) {
1561:     k    = PetscMax(A->rmap->n,A->cmap->n);
1562:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1563:   }
1564:   work = a->mult_work;
1565:   for (i=0; i<mbs; i++) {
1566:     n     = ii[1] - ii[0]; ii++;
1567:     ncols = n*bs;
1568:     workt = work;
1569:     for (j=0; j<n; j++) {
1570:       xb = x + bs*(*idx++);
1571:       for (k=0; k<bs; k++) workt[k] = xb[k];
1572:       workt += bs;
1573:     }
1574:     if (usecprow) z = zarray + bs*ridx[i];
1575:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1576:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1577:     v += n*bs2;
1578:     if (!usecprow) z += bs;
1579:   }
1580:   VecRestoreArray(xx,&x);
1581:   VecRestoreArray(zz,&zarray);
1582:   PetscLogFlops(2.0*a->nz*bs2);
1583:   return(0);
1584: }

1588: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1589: {
1590:   PetscScalar    zero = 0.0;

1594:   VecSet(zz,zero);
1595:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1596:   return(0);
1597: }

1601: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1602: {
1603:   PetscScalar    zero = 0.0;

1607:   VecSet(zz,zero);
1608:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1609:   return(0);
1610: }

1614: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1615: {
1616:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1617:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1618:   MatScalar         *v;
1619:   PetscErrorCode    ierr;
1620:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1621:   Mat_CompressedRow cprow   = a->compressedrow;
1622:   PetscBool         usecprow=cprow.use;

1625:   if (yy != zz) { VecCopy(yy,zz); }
1626:   VecGetArray(xx,&x);
1627:   VecGetArray(zz,&z);

1629:   idx = a->j;
1630:   v   = a->a;
1631:   if (usecprow) {
1632:     mbs  = cprow.nrows;
1633:     ii   = cprow.i;
1634:     ridx = cprow.rindex;
1635:   } else {
1636:     mbs=a->mbs;
1637:     ii = a->i;
1638:     xb = x;
1639:   }

1641:   switch (bs) {
1642:   case 1:
1643:     for (i=0; i<mbs; i++) {
1644:       if (usecprow) xb = x + ridx[i];
1645:       x1 = xb[0];
1646:       ib = idx + ii[0];
1647:       n  = ii[1] - ii[0]; ii++;
1648:       for (j=0; j<n; j++) {
1649:         rval     = ib[j];
1650:         z[rval] += PetscConj(*v) * x1;
1651:         v++;
1652:       }
1653:       if (!usecprow) xb++;
1654:     }
1655:     break;
1656:   case 2:
1657:     for (i=0; i<mbs; i++) {
1658:       if (usecprow) xb = x + 2*ridx[i];
1659:       x1 = xb[0]; x2 = xb[1];
1660:       ib = idx + ii[0];
1661:       n  = ii[1] - ii[0]; ii++;
1662:       for (j=0; j<n; j++) {
1663:         rval       = ib[j]*2;
1664:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1665:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1666:         v         += 4;
1667:       }
1668:       if (!usecprow) xb += 2;
1669:     }
1670:     break;
1671:   case 3:
1672:     for (i=0; i<mbs; i++) {
1673:       if (usecprow) xb = x + 3*ridx[i];
1674:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1675:       ib = idx + ii[0];
1676:       n  = ii[1] - ii[0]; ii++;
1677:       for (j=0; j<n; j++) {
1678:         rval       = ib[j]*3;
1679:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1680:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1681:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1682:         v         += 9;
1683:       }
1684:       if (!usecprow) xb += 3;
1685:     }
1686:     break;
1687:   case 4:
1688:     for (i=0; i<mbs; i++) {
1689:       if (usecprow) xb = x + 4*ridx[i];
1690:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1691:       ib = idx + ii[0];
1692:       n  = ii[1] - ii[0]; ii++;
1693:       for (j=0; j<n; j++) {
1694:         rval       = ib[j]*4;
1695:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1696:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1697:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1698:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1699:         v         += 16;
1700:       }
1701:       if (!usecprow) xb += 4;
1702:     }
1703:     break;
1704:   case 5:
1705:     for (i=0; i<mbs; i++) {
1706:       if (usecprow) xb = x + 5*ridx[i];
1707:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1708:       x4 = xb[3]; x5 = xb[4];
1709:       ib = idx + ii[0];
1710:       n  = ii[1] - ii[0]; ii++;
1711:       for (j=0; j<n; j++) {
1712:         rval       = ib[j]*5;
1713:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1714:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1715:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1716:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1717:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1718:         v         += 25;
1719:       }
1720:       if (!usecprow) xb += 5;
1721:     }
1722:     break;
1723:   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1724:     PetscInt    ncols,k;
1725:     PetscScalar *work,*workt,*xtmp;

1727:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1728:     if (!a->mult_work) {
1729:       k    = PetscMax(A->rmap->n,A->cmap->n);
1730:       PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1731:     }
1732:     work = a->mult_work;
1733:     xtmp = x;
1734:     for (i=0; i<mbs; i++) {
1735:       n     = ii[1] - ii[0]; ii++;
1736:       ncols = n*bs;
1737:       PetscMemzero(work,ncols*sizeof(PetscScalar));
1738:       if (usecprow) xtmp = x + bs*ridx[i];
1739:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1740:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1741:       v += n*bs2;
1742:       if (!usecprow) xtmp += bs;
1743:       workt = work;
1744:       for (j=0; j<n; j++) {
1745:         zb = z + bs*(*idx++);
1746:         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1747:         workt += bs;
1748:       }
1749:     }
1750:     }
1751:   }
1752:   VecRestoreArray(xx,&x);
1753:   VecRestoreArray(zz,&z);
1754:   PetscLogFlops(2.0*a->nz*a->bs2);
1755:   return(0);
1756: }

1760: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1761: {
1762:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1763:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1764:   MatScalar         *v;
1765:   PetscErrorCode    ierr;
1766:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1767:   Mat_CompressedRow cprow   = a->compressedrow;
1768:   PetscBool         usecprow=cprow.use;

1771:   if (yy != zz) { VecCopy(yy,zz); }
1772:   VecGetArray(xx,&x);
1773:   VecGetArray(zz,&z);

1775:   idx = a->j;
1776:   v   = a->a;
1777:   if (usecprow) {
1778:     mbs  = cprow.nrows;
1779:     ii   = cprow.i;
1780:     ridx = cprow.rindex;
1781:   } else {
1782:     mbs=a->mbs;
1783:     ii = a->i;
1784:     xb = x;
1785:   }

1787:   switch (bs) {
1788:   case 1:
1789:     for (i=0; i<mbs; i++) {
1790:       if (usecprow) xb = x + ridx[i];
1791:       x1 = xb[0];
1792:       ib = idx + ii[0];
1793:       n  = ii[1] - ii[0]; ii++;
1794:       for (j=0; j<n; j++) {
1795:         rval     = ib[j];
1796:         z[rval] += *v * x1;
1797:         v++;
1798:       }
1799:       if (!usecprow) xb++;
1800:     }
1801:     break;
1802:   case 2:
1803:     for (i=0; i<mbs; i++) {
1804:       if (usecprow) xb = x + 2*ridx[i];
1805:       x1 = xb[0]; x2 = xb[1];
1806:       ib = idx + ii[0];
1807:       n  = ii[1] - ii[0]; ii++;
1808:       for (j=0; j<n; j++) {
1809:         rval       = ib[j]*2;
1810:         z[rval++] += v[0]*x1 + v[1]*x2;
1811:         z[rval++] += v[2]*x1 + v[3]*x2;
1812:         v         += 4;
1813:       }
1814:       if (!usecprow) xb += 2;
1815:     }
1816:     break;
1817:   case 3:
1818:     for (i=0; i<mbs; i++) {
1819:       if (usecprow) xb = x + 3*ridx[i];
1820:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1821:       ib = idx + ii[0];
1822:       n  = ii[1] - ii[0]; ii++;
1823:       for (j=0; j<n; j++) {
1824:         rval       = ib[j]*3;
1825:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1826:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1827:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1828:         v         += 9;
1829:       }
1830:       if (!usecprow) xb += 3;
1831:     }
1832:     break;
1833:   case 4:
1834:     for (i=0; i<mbs; i++) {
1835:       if (usecprow) xb = x + 4*ridx[i];
1836:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1837:       ib = idx + ii[0];
1838:       n  = ii[1] - ii[0]; ii++;
1839:       for (j=0; j<n; j++) {
1840:         rval       = ib[j]*4;
1841:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1842:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1843:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1844:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1845:         v         += 16;
1846:       }
1847:       if (!usecprow) xb += 4;
1848:     }
1849:     break;
1850:   case 5:
1851:     for (i=0; i<mbs; i++) {
1852:       if (usecprow) xb = x + 5*ridx[i];
1853:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1854:       x4 = xb[3]; x5 = xb[4];
1855:       ib = idx + ii[0];
1856:       n  = ii[1] - ii[0]; ii++;
1857:       for (j=0; j<n; j++) {
1858:         rval       = ib[j]*5;
1859:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1860:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1861:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1862:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1863:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1864:         v         += 25;
1865:       }
1866:       if (!usecprow) xb += 5;
1867:     }
1868:     break;
1869:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1870:     PetscInt    ncols,k;
1871:     PetscScalar *work,*workt,*xtmp;

1873:     if (!a->mult_work) {
1874:       k    = PetscMax(A->rmap->n,A->cmap->n);
1875:       PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1876:     }
1877:     work = a->mult_work;
1878:     xtmp = x;
1879:     for (i=0; i<mbs; i++) {
1880:       n     = ii[1] - ii[0]; ii++;
1881:       ncols = n*bs;
1882:       PetscMemzero(work,ncols*sizeof(PetscScalar));
1883:       if (usecprow) xtmp = x + bs*ridx[i];
1884:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1885:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1886:       v += n*bs2;
1887:       if (!usecprow) xtmp += bs;
1888:       workt = work;
1889:       for (j=0; j<n; j++) {
1890:         zb = z + bs*(*idx++);
1891:         for (k=0; k<bs; k++) zb[k] += workt[k];
1892:         workt += bs;
1893:       }
1894:     }
1895:     }
1896:   }
1897:   VecRestoreArray(xx,&x);
1898:   VecRestoreArray(zz,&z);
1899:   PetscLogFlops(2.0*a->nz*a->bs2);
1900:   return(0);
1901: }

1905: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1906: {
1907:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1908:   PetscInt       totalnz = a->bs2*a->nz;
1909:   PetscScalar    oalpha  = alpha;
1911:   PetscBLASInt   one = 1,tnz;

1914:   PetscBLASIntCast(totalnz,&tnz);
1915:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1916:   PetscLogFlops(totalnz);
1917:   return(0);
1918: }

1922: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1923: {
1925:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1926:   MatScalar      *v  = a->a;
1927:   PetscReal      sum = 0.0;
1928:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

1931:   if (type == NORM_FROBENIUS) {
1932:     for (i=0; i< bs2*nz; i++) {
1933:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1934:     }
1935:     *norm = PetscSqrtReal(sum);
1936:   } else if (type == NORM_1) { /* maximum column sum */
1937:     PetscReal *tmp;
1938:     PetscInt  *bcol = a->j;
1939:     PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1940:     PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1941:     for (i=0; i<nz; i++) {
1942:       for (j=0; j<bs; j++) {
1943:         k1 = bs*(*bcol) + j; /* column index */
1944:         for (k=0; k<bs; k++) {
1945:           tmp[k1] += PetscAbsScalar(*v); v++;
1946:         }
1947:       }
1948:       bcol++;
1949:     }
1950:     *norm = 0.0;
1951:     for (j=0; j<A->cmap->n; j++) {
1952:       if (tmp[j] > *norm) *norm = tmp[j];
1953:     }
1954:     PetscFree(tmp);
1955:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1956:     *norm = 0.0;
1957:     for (k=0; k<bs; k++) {
1958:       for (j=0; j<a->mbs; j++) {
1959:         v   = a->a + bs2*a->i[j] + k;
1960:         sum = 0.0;
1961:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1962:           for (k1=0; k1<bs; k1++) {
1963:             sum += PetscAbsScalar(*v);
1964:             v   += bs;
1965:           }
1966:         }
1967:         if (sum > *norm) *norm = sum;
1968:       }
1969:     }
1970:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1971:   return(0);
1972: }


1977: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1978: {
1979:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

1983:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1984:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1985:     *flg = PETSC_FALSE;
1986:     return(0);
1987:   }

1989:   /* if the a->i are the same */
1990:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1991:   if (!*flg) return(0);

1993:   /* if a->j are the same */
1994:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1995:   if (!*flg) return(0);

1997:   /* if a->a are the same */
1998:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1999:   return(0);

2001: }

2005: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2006: {
2007:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2009:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2010:   PetscScalar    *x,zero = 0.0;
2011:   MatScalar      *aa,*aa_j;

2014:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2015:   bs   = A->rmap->bs;
2016:   aa   = a->a;
2017:   ai   = a->i;
2018:   aj   = a->j;
2019:   ambs = a->mbs;
2020:   bs2  = a->bs2;

2022:   VecSet(v,zero);
2023:   VecGetArray(v,&x);
2024:   VecGetLocalSize(v,&n);
2025:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2026:   for (i=0; i<ambs; i++) {
2027:     for (j=ai[i]; j<ai[i+1]; j++) {
2028:       if (aj[j] == i) {
2029:         row  = i*bs;
2030:         aa_j = aa+j*bs2;
2031:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2032:         break;
2033:       }
2034:     }
2035:   }
2036:   VecRestoreArray(v,&x);
2037:   return(0);
2038: }

2042: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2043: {
2044:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2045:   const PetscScalar *l,*r,*li,*ri;
2046:   PetscScalar       x;
2047:   MatScalar         *aa, *v;
2048:   PetscErrorCode    ierr;
2049:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2050:   const PetscInt    *ai,*aj;

2053:   ai  = a->i;
2054:   aj  = a->j;
2055:   aa  = a->a;
2056:   m   = A->rmap->n;
2057:   n   = A->cmap->n;
2058:   bs  = A->rmap->bs;
2059:   mbs = a->mbs;
2060:   bs2 = a->bs2;
2061:   if (ll) {
2062:     VecGetArrayRead(ll,&l);
2063:     VecGetLocalSize(ll,&lm);
2064:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2065:     for (i=0; i<mbs; i++) { /* for each block row */
2066:       M  = ai[i+1] - ai[i];
2067:       li = l + i*bs;
2068:       v  = aa + bs2*ai[i];
2069:       for (j=0; j<M; j++) { /* for each block */
2070:         for (k=0; k<bs2; k++) {
2071:           (*v++) *= li[k%bs];
2072:         }
2073:       }
2074:     }
2075:     VecRestoreArrayRead(ll,&l);
2076:     PetscLogFlops(a->nz);
2077:   }

2079:   if (rr) {
2080:     VecGetArrayRead(rr,&r);
2081:     VecGetLocalSize(rr,&rn);
2082:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2083:     for (i=0; i<mbs; i++) { /* for each block row */
2084:       iai = ai[i];
2085:       M   = ai[i+1] - iai;
2086:       v   = aa + bs2*iai;
2087:       for (j=0; j<M; j++) { /* for each block */
2088:         ri = r + bs*aj[iai+j];
2089:         for (k=0; k<bs; k++) {
2090:           x = ri[k];
2091:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2092:           v += bs;
2093:         }
2094:       }
2095:     }
2096:     VecRestoreArrayRead(rr,&r);
2097:     PetscLogFlops(a->nz);
2098:   }
2099:   return(0);
2100: }


2105: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2106: {
2107:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2110:   info->block_size   = a->bs2;
2111:   info->nz_allocated = a->bs2*a->maxnz;
2112:   info->nz_used      = a->bs2*a->nz;
2113:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2114:   info->assemblies   = A->num_ass;
2115:   info->mallocs      = A->info.mallocs;
2116:   info->memory       = ((PetscObject)A)->mem;
2117:   if (A->factortype) {
2118:     info->fill_ratio_given  = A->info.fill_ratio_given;
2119:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2120:     info->factor_mallocs    = A->info.factor_mallocs;
2121:   } else {
2122:     info->fill_ratio_given  = 0;
2123:     info->fill_ratio_needed = 0;
2124:     info->factor_mallocs    = 0;
2125:   }
2126:   return(0);
2127: }


2132: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2133: {
2134:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2138:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2139:   return(0);
2140: }