Actual source code: baij2.c

petsc-3.8.4 2018-03-24
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <petsc/private/kernels/blockinvert.h>
  4:  #include <petscbt.h>
  5:  #include <petscblaslapack.h>

  7: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
  8: {
  9:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 11:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 12:   const PetscInt *idx;
 13:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 14:   PetscBT        table;

 17:   m  = a->mbs;
 18:   ai = a->i;
 19:   aj = a->j;
 20:   bs = A->rmap->bs;

 22:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 24:   PetscBTCreate(m,&table);
 25:   PetscMalloc1(m+1,&nidx);
 26:   PetscMalloc1(A->rmap->N+1,&nidx2);

 28:   for (i=0; i<is_max; i++) {
 29:     /* Initialise the two local arrays */
 30:     isz  = 0;
 31:     PetscBTMemzero(m,table);

 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 38:     for (j=0; j<n; ++j) {
 39:       ival = idx[j]/bs; /* convert the indices into block indices */
 40:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 41:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 42:     }
 43:     ISRestoreIndices(is[i],&idx);
 44:     ISDestroy(&is[i]);

 46:     k = 0;
 47:     for (j=0; j<ov; j++) { /* for each overlap*/
 48:       n = isz;
 49:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 50:         row   = nidx[k];
 51:         start = ai[row];
 52:         end   = ai[row+1];
 53:         for (l = start; l<end; l++) {
 54:           val = aj[l];
 55:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 56:         }
 57:       }
 58:     }
 59:     /* expand the Index Set */
 60:     for (j=0; j<isz; j++) {
 61:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 62:     }
 63:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 64:   }
 65:   PetscBTDestroy(&table);
 66:   PetscFree(nidx);
 67:   PetscFree(nidx2);
 68:   return(0);
 69: }

 71: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 72: {
 73:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 75:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 76:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 77:   const PetscInt *irow,*icol;
 78:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 79:   PetscInt       *aj = a->j,*ai = a->i;
 80:   MatScalar      *mat_a;
 81:   Mat            C;
 82:   PetscBool      flag;

 85:   ISGetIndices(isrow,&irow);
 86:   ISGetIndices(iscol,&icol);
 87:   ISGetLocalSize(isrow,&nrows);
 88:   ISGetLocalSize(iscol,&ncols);

 90:   PetscCalloc1(1+oldcols,&smap);
 91:   ssmap = smap;
 92:   PetscMalloc1(1+nrows,&lens);
 93:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 94:   /* determine lens of each row */
 95:   for (i=0; i<nrows; i++) {
 96:     kstart  = ai[irow[i]];
 97:     kend    = kstart + a->ilen[irow[i]];
 98:     lens[i] = 0;
 99:     for (k=kstart; k<kend; k++) {
100:       if (ssmap[aj[k]]) lens[i]++;
101:     }
102:   }
103:   /* Create and fill new matrix */
104:   if (scall == MAT_REUSE_MATRIX) {
105:     c = (Mat_SeqBAIJ*)((*B)->data);

107:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
109:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
111:     C    = *B;
112:   } else {
113:     MatCreate(PetscObjectComm((PetscObject)A),&C);
114:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
115:     MatSetType(C,((PetscObject)A)->type_name);
116:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
117:   }
118:   c = (Mat_SeqBAIJ*)(C->data);
119:   for (i=0; i<nrows; i++) {
120:     row      = irow[i];
121:     kstart   = ai[row];
122:     kend     = kstart + a->ilen[row];
123:     mat_i    = c->i[i];
124:     mat_j    = c->j + mat_i;
125:     mat_a    = c->a + mat_i*bs2;
126:     mat_ilen = c->ilen + i;
127:     for (k=kstart; k<kend; k++) {
128:       if ((tcol=ssmap[a->j[k]])) {
129:         *mat_j++ = tcol - 1;
130:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
131:         mat_a   += bs2;
132:         (*mat_ilen)++;
133:       }
134:     }
135:   }
136:   /* sort */
137:   {
138:     MatScalar *work;
139:     PetscMalloc1(bs2,&work);
140:     for (i=0; i<nrows; i++) {
141:       PetscInt ilen;
142:       mat_i = c->i[i];
143:       mat_j = c->j + mat_i;
144:       mat_a = c->a + mat_i*bs2;
145:       ilen  = c->ilen[i];
146:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
147:     }
148:     PetscFree(work);
149:   }

151:   /* Free work space */
152:   ISRestoreIndices(iscol,&icol);
153:   PetscFree(smap);
154:   PetscFree(lens);
155:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

158:   ISRestoreIndices(isrow,&irow);
159:   *B   = C;
160:   return(0);
161: }

163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
168:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169:   const PetscInt *irow,*icol;

172:   ISGetIndices(isrow,&irow);
173:   ISGetIndices(iscol,&icol);
174:   ISGetLocalSize(isrow,&nrows);
175:   ISGetLocalSize(iscol,&ncols);

177:   /* Verify if the indices corespond to each element in a block
178:    and form the IS with compressed IS */
179:   maxmnbs = PetscMax(a->mbs,a->nbs);
180:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
181:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
182:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183:   for (i=0; i<a->mbs; i++) {
184:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185:   }
186:   count = 0;
187:   for (i=0; i<nrows; i++) {
188:     j = irow[i] / bs;
189:     if ((vary[j]--)==bs) iary[count++] = j;
190:   }
191:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

193:   PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
194:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195:   for (i=0; i<a->nbs; i++) {
196:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197:   }
198:   count = 0;
199:   for (i=0; i<ncols; i++) {
200:     j = icol[i] / bs;
201:     if ((vary[j]--)==bs) iary[count++] = j;
202:   }
203:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
204:   ISRestoreIndices(isrow,&irow);
205:   ISRestoreIndices(iscol,&icol);
206:   PetscFree2(vary,iary);

208:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
209:   ISDestroy(&is1);
210:   ISDestroy(&is2);
211:   return(0);
212: }

214: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
215: {
217:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218:   Mat_SubSppt    *submatj = c->submatis1;

221:   submatj->destroy(C);
222:   MatDestroySubMatrix_Private(submatj);
223:   return(0);
224: }

226: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
227: {
229:   PetscInt       i;
230:   Mat            C;
231:   Mat_SeqBAIJ    *c;
232:   Mat_SubSppt    *submatj;

235:   for (i=0; i<n; i++) {
236:     C       = (*mat)[i];
237:     c       = (Mat_SeqBAIJ*)C->data;
238:     submatj = c->submatis1;
239:     if (submatj) {
240:       submatj->destroy(C);
241:       MatDestroySubMatrix_Private(submatj);
242:       PetscLayoutDestroy(&C->rmap);
243:       PetscLayoutDestroy(&C->cmap);
244:       PetscHeaderDestroy(&C);
245:     } else {
246:       MatDestroy(&C);
247:     }
248:   }
249:   PetscFree(*mat);
250:   return(0);
251: }

253: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
254: {
256:   PetscInt       i;

259:   if (scall == MAT_INITIAL_MATRIX) {
260:     PetscCalloc1(n+1,B);
261:   }

263:   for (i=0; i<n; i++) {
264:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
265:   }
266:   return(0);
267: }

269: /* -------------------------------------------------------*/
270: /* Should check that shapes of vectors and matrices match */
271: /* -------------------------------------------------------*/

273: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
274: {
275:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
276:   PetscScalar       *z,sum;
277:   const PetscScalar *x;
278:   const MatScalar   *v;
279:   PetscErrorCode    ierr;
280:   PetscInt          mbs,i,n;
281:   const PetscInt    *idx,*ii,*ridx=NULL;
282:   PetscBool         usecprow=a->compressedrow.use;

285:   VecGetArrayRead(xx,&x);
286:   VecGetArray(zz,&z);

288:   if (usecprow) {
289:     mbs  = a->compressedrow.nrows;
290:     ii   = a->compressedrow.i;
291:     ridx = a->compressedrow.rindex;
292:     PetscMemzero(z,a->mbs*sizeof(PetscScalar));
293:   } else {
294:     mbs = a->mbs;
295:     ii  = a->i;
296:   }

298:   for (i=0; i<mbs; i++) {
299:     n   = ii[1] - ii[0];
300:     v   = a->a + ii[0];
301:     idx = a->j + ii[0];
302:     ii++;
303:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
304:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
305:     sum = 0.0;
306:     PetscSparseDensePlusDot(sum,x,v,idx,n);
307:     if (usecprow) {
308:       z[ridx[i]] = sum;
309:     } else {
310:       z[i]       = sum;
311:     }
312:   }
313:   VecRestoreArrayRead(xx,&x);
314:   VecRestoreArray(zz,&z);
315:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
316:   return(0);
317: }

319: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
320: {
321:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
322:   PetscScalar       *z = 0,sum1,sum2,*zarray;
323:   const PetscScalar *x,*xb;
324:   PetscScalar       x1,x2;
325:   const MatScalar   *v;
326:   PetscErrorCode    ierr;
327:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
328:   PetscBool         usecprow=a->compressedrow.use;

331:   VecGetArrayRead(xx,&x);
332:   VecGetArray(zz,&zarray);

334:   idx = a->j;
335:   v   = a->a;
336:   if (usecprow) {
337:     mbs  = a->compressedrow.nrows;
338:     ii   = a->compressedrow.i;
339:     ridx = a->compressedrow.rindex;
340:     PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
341:   } else {
342:     mbs = a->mbs;
343:     ii  = a->i;
344:     z   = zarray;
345:   }

347:   for (i=0; i<mbs; i++) {
348:     n           = ii[1] - ii[0]; ii++;
349:     sum1        = 0.0; sum2 = 0.0;
350:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
351:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
352:     for (j=0; j<n; j++) {
353:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
354:       sum1 += v[0]*x1 + v[2]*x2;
355:       sum2 += v[1]*x1 + v[3]*x2;
356:       v    += 4;
357:     }
358:     if (usecprow) z = zarray + 2*ridx[i];
359:     z[0] = sum1; z[1] = sum2;
360:     if (!usecprow) z += 2;
361:   }
362:   VecRestoreArrayRead(xx,&x);
363:   VecRestoreArray(zz,&zarray);
364:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
365:   return(0);
366: }

368: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
369: {
370:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
371:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
372:   const PetscScalar *x,*xb;
373:   const MatScalar   *v;
374:   PetscErrorCode    ierr;
375:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
376:   PetscBool         usecprow=a->compressedrow.use;

378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*v,*z,*xb)
380: #endif

383:   VecGetArrayRead(xx,&x);
384:   VecGetArray(zz,&zarray);

386:   idx = a->j;
387:   v   = a->a;
388:   if (usecprow) {
389:     mbs  = a->compressedrow.nrows;
390:     ii   = a->compressedrow.i;
391:     ridx = a->compressedrow.rindex;
392:     PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
393:   } else {
394:     mbs = a->mbs;
395:     ii  = a->i;
396:     z   = zarray;
397:   }

399:   for (i=0; i<mbs; i++) {
400:     n           = ii[1] - ii[0]; ii++;
401:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
402:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
403:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404:     for (j=0; j<n; j++) {
405:       xb = x + 3*(*idx++);
406:       x1 = xb[0];
407:       x2 = xb[1];
408:       x3 = xb[2];

410:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
411:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
412:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
413:       v    += 9;
414:     }
415:     if (usecprow) z = zarray + 3*ridx[i];
416:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
417:     if (!usecprow) z += 3;
418:   }
419:   VecRestoreArrayRead(xx,&x);
420:   VecRestoreArray(zz,&zarray);
421:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
422:   return(0);
423: }

425: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
426: {
427:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
428:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
429:   const PetscScalar *x,*xb;
430:   const MatScalar   *v;
431:   PetscErrorCode    ierr;
432:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
433:   PetscBool         usecprow=a->compressedrow.use;

436:   VecGetArrayRead(xx,&x);
437:   VecGetArray(zz,&zarray);

439:   idx = a->j;
440:   v   = a->a;
441:   if (usecprow) {
442:     mbs  = a->compressedrow.nrows;
443:     ii   = a->compressedrow.i;
444:     ridx = a->compressedrow.rindex;
445:     PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
446:   } else {
447:     mbs = a->mbs;
448:     ii  = a->i;
449:     z   = zarray;
450:   }

452:   for (i=0; i<mbs; i++) {
453:     n = ii[1] - ii[0];
454:     ii++;
455:     sum1 = 0.0;
456:     sum2 = 0.0;
457:     sum3 = 0.0;
458:     sum4 = 0.0;

460:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
461:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
462:     for (j=0; j<n; j++) {
463:       xb    = x + 4*(*idx++);
464:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
465:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
466:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
467:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
468:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
469:       v    += 16;
470:     }
471:     if (usecprow) z = zarray + 4*ridx[i];
472:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
473:     if (!usecprow) z += 4;
474:   }
475:   VecRestoreArrayRead(xx,&x);
476:   VecRestoreArray(zz,&zarray);
477:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
478:   return(0);
479: }

481: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
482: {
483:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
484:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
485:   const PetscScalar *xb,*x;
486:   const MatScalar   *v;
487:   PetscErrorCode    ierr;
488:   const PetscInt    *idx,*ii,*ridx=NULL;
489:   PetscInt          mbs,i,j,n;
490:   PetscBool         usecprow=a->compressedrow.use;

493:   VecGetArrayRead(xx,&x);
494:   VecGetArray(zz,&zarray);

496:   idx = a->j;
497:   v   = a->a;
498:   if (usecprow) {
499:     mbs  = a->compressedrow.nrows;
500:     ii   = a->compressedrow.i;
501:     ridx = a->compressedrow.rindex;
502:     PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
503:   } else {
504:     mbs = a->mbs;
505:     ii  = a->i;
506:     z   = zarray;
507:   }

509:   for (i=0; i<mbs; i++) {
510:     n           = ii[1] - ii[0]; ii++;
511:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
512:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
513:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
514:     for (j=0; j<n; j++) {
515:       xb    = x + 5*(*idx++);
516:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
517:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
518:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
519:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
520:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
521:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
522:       v    += 25;
523:     }
524:     if (usecprow) z = zarray + 5*ridx[i];
525:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
526:     if (!usecprow) z += 5;
527:   }
528:   VecRestoreArrayRead(xx,&x);
529:   VecRestoreArray(zz,&zarray);
530:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
531:   return(0);
532: }



536: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
537: {
538:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
539:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
540:   const PetscScalar *x,*xb;
541:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
542:   const MatScalar   *v;
543:   PetscErrorCode    ierr;
544:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
545:   PetscBool         usecprow=a->compressedrow.use;

548:   VecGetArrayRead(xx,&x);
549:   VecGetArray(zz,&zarray);

551:   idx = a->j;
552:   v   = a->a;
553:   if (usecprow) {
554:     mbs  = a->compressedrow.nrows;
555:     ii   = a->compressedrow.i;
556:     ridx = a->compressedrow.rindex;
557:     PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
558:   } else {
559:     mbs = a->mbs;
560:     ii  = a->i;
561:     z   = zarray;
562:   }

564:   for (i=0; i<mbs; i++) {
565:     n  = ii[1] - ii[0];
566:     ii++;
567:     sum1 = 0.0;
568:     sum2 = 0.0;
569:     sum3 = 0.0;
570:     sum4 = 0.0;
571:     sum5 = 0.0;
572:     sum6 = 0.0;

574:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
575:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
576:     for (j=0; j<n; j++) {
577:       xb    = x + 6*(*idx++);
578:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
579:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
580:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
581:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
582:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
583:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
584:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
585:       v    += 36;
586:     }
587:     if (usecprow) z = zarray + 6*ridx[i];
588:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
589:     if (!usecprow) z += 6;
590:   }

592:   VecRestoreArrayRead(xx,&x);
593:   VecRestoreArray(zz,&zarray);
594:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
595:   return(0);
596: }

598: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
599: {
600:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
601:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
602:   const PetscScalar *x,*xb;
603:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
604:   const MatScalar   *v;
605:   PetscErrorCode    ierr;
606:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
607:   PetscBool         usecprow=a->compressedrow.use;

610:   VecGetArrayRead(xx,&x);
611:   VecGetArray(zz,&zarray);

613:   idx = a->j;
614:   v   = a->a;
615:   if (usecprow) {
616:     mbs  = a->compressedrow.nrows;
617:     ii   = a->compressedrow.i;
618:     ridx = a->compressedrow.rindex;
619:     PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
620:   } else {
621:     mbs = a->mbs;
622:     ii  = a->i;
623:     z   = zarray;
624:   }

626:   for (i=0; i<mbs; i++) {
627:     n  = ii[1] - ii[0];
628:     ii++;
629:     sum1 = 0.0;
630:     sum2 = 0.0;
631:     sum3 = 0.0;
632:     sum4 = 0.0;
633:     sum5 = 0.0;
634:     sum6 = 0.0;
635:     sum7 = 0.0;

637:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
638:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
639:     for (j=0; j<n; j++) {
640:       xb    = x + 7*(*idx++);
641:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
642:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
643:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
644:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
645:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
646:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
647:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
648:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
649:       v    += 49;
650:     }
651:     if (usecprow) z = zarray + 7*ridx[i];
652:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
653:     if (!usecprow) z += 7;
654:   }

656:   VecRestoreArrayRead(xx,&x);
657:   VecRestoreArray(zz,&zarray);
658:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
659:   return(0);
660: }

662: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
663: {
664:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
665:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
666:   const PetscScalar *x,*xb;
667:   PetscScalar       *zarray,xv;
668:   const MatScalar   *v;
669:   PetscErrorCode    ierr;
670:   const PetscInt    *ii,*ij=a->j,*idx;
671:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
672:   PetscBool         usecprow=a->compressedrow.use;

675:   VecGetArrayRead(xx,&x);
676:   VecGetArray(zz,&zarray);

678:   v = a->a;
679:   if (usecprow) {
680:     mbs  = a->compressedrow.nrows;
681:     ii   = a->compressedrow.i;
682:     ridx = a->compressedrow.rindex;
683:     PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));
684:   } else {
685:     mbs = a->mbs;
686:     ii  = a->i;
687:     z   = zarray;
688:   }

690:   for (i=0; i<mbs; i++) {
691:     n    = ii[i+1] - ii[i];
692:     idx  = ij + ii[i];
693:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
694:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

696:     for (j=0; j<n; j++) {
697:       xb = x + 11*(idx[j]);

699:       for (k=0; k<11; k++) {
700:         xv     =  xb[k];
701:         sum1  += v[0]*xv;
702:         sum2  += v[1]*xv;
703:         sum3  += v[2]*xv;
704:         sum4  += v[3]*xv;
705:         sum5  += v[4]*xv;
706:         sum6  += v[5]*xv;
707:         sum7  += v[6]*xv;
708:         sum8  += v[7]*xv;
709:         sum9  += v[8]*xv;
710:         sum10 += v[9]*xv;
711:         sum11 += v[10]*xv;
712:         v     += 11;
713:       }
714:     }
715:     if (usecprow) z = zarray + 11*ridx[i];
716:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
717:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

719:     if (!usecprow) z += 11;
720:   }

722:   VecRestoreArrayRead(xx,&x);
723:   VecRestoreArray(zz,&zarray);
724:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
725:   return(0);
726: }

728: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
729: /* Default MatMult for block size 15 */

731: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
732: {
733:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
734:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
735:   const PetscScalar *x,*xb;
736:   PetscScalar       *zarray,xv;
737:   const MatScalar   *v;
738:   PetscErrorCode    ierr;
739:   const PetscInt    *ii,*ij=a->j,*idx;
740:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
741:   PetscBool         usecprow=a->compressedrow.use;

744:   VecGetArrayRead(xx,&x);
745:   VecGetArray(zz,&zarray);

747:   v = a->a;
748:   if (usecprow) {
749:     mbs  = a->compressedrow.nrows;
750:     ii   = a->compressedrow.i;
751:     ridx = a->compressedrow.rindex;
752:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
753:   } else {
754:     mbs = a->mbs;
755:     ii  = a->i;
756:     z   = zarray;
757:   }

759:   for (i=0; i<mbs; i++) {
760:     n    = ii[i+1] - ii[i];
761:     idx  = ij + ii[i];
762:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
763:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

765:     for (j=0; j<n; j++) {
766:       xb = x + 15*(idx[j]);

768:       for (k=0; k<15; k++) {
769:         xv     =  xb[k];
770:         sum1  += v[0]*xv;
771:         sum2  += v[1]*xv;
772:         sum3  += v[2]*xv;
773:         sum4  += v[3]*xv;
774:         sum5  += v[4]*xv;
775:         sum6  += v[5]*xv;
776:         sum7  += v[6]*xv;
777:         sum8  += v[7]*xv;
778:         sum9  += v[8]*xv;
779:         sum10 += v[9]*xv;
780:         sum11 += v[10]*xv;
781:         sum12 += v[11]*xv;
782:         sum13 += v[12]*xv;
783:         sum14 += v[13]*xv;
784:         sum15 += v[14]*xv;
785:         v     += 15;
786:       }
787:     }
788:     if (usecprow) z = zarray + 15*ridx[i];
789:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

792:     if (!usecprow) z += 15;
793:   }

795:   VecRestoreArrayRead(xx,&x);
796:   VecRestoreArray(zz,&zarray);
797:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
798:   return(0);
799: }

801: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
802: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
803: {
804:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
805:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
806:   const PetscScalar *x,*xb;
807:   PetscScalar       x1,x2,x3,x4,*zarray;
808:   const MatScalar   *v;
809:   PetscErrorCode    ierr;
810:   const PetscInt    *ii,*ij=a->j,*idx;
811:   PetscInt          mbs,i,j,n,*ridx=NULL;
812:   PetscBool         usecprow=a->compressedrow.use;

815:   VecGetArrayRead(xx,&x);
816:   VecGetArray(zz,&zarray);

818:   v = a->a;
819:   if (usecprow) {
820:     mbs  = a->compressedrow.nrows;
821:     ii   = a->compressedrow.i;
822:     ridx = a->compressedrow.rindex;
823:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
824:   } else {
825:     mbs = a->mbs;
826:     ii  = a->i;
827:     z   = zarray;
828:   }

830:   for (i=0; i<mbs; i++) {
831:     n    = ii[i+1] - ii[i];
832:     idx  = ij + ii[i];
833:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
834:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

836:     for (j=0; j<n; j++) {
837:       xb = x + 15*(idx[j]);
838:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

840:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
841:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
842:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
843:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
844:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
845:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
846:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
847:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
848:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
849:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
850:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
851:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
852:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
853:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
854:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

856:       v += 60;

858:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

860:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
861:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
862:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
863:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
864:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
865:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
866:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
867:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
868:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
869:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
870:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
871:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
872:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
873:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
874:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
875:       v     += 60;

877:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
878:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
879:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
880:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
881:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
882:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
883:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
884:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
885:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
886:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
887:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
888:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
889:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
890:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
891:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
892:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
893:       v     += 60;

895:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
896:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
897:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
898:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
899:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
900:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
901:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
902:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
903:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
904:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
905:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
906:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
907:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
908:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
909:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
910:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
911:       v     += 45;
912:     }
913:     if (usecprow) z = zarray + 15*ridx[i];
914:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

917:     if (!usecprow) z += 15;
918:   }

920:   VecRestoreArrayRead(xx,&x);
921:   VecRestoreArray(zz,&zarray);
922:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
923:   return(0);
924: }

926: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
927: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
928: {
929:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931:   const PetscScalar *x,*xb;
932:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
933:   const MatScalar   *v;
934:   PetscErrorCode    ierr;
935:   const PetscInt    *ii,*ij=a->j,*idx;
936:   PetscInt          mbs,i,j,n,*ridx=NULL;
937:   PetscBool         usecprow=a->compressedrow.use;

940:   VecGetArrayRead(xx,&x);
941:   VecGetArray(zz,&zarray);

943:   v = a->a;
944:   if (usecprow) {
945:     mbs  = a->compressedrow.nrows;
946:     ii   = a->compressedrow.i;
947:     ridx = a->compressedrow.rindex;
948:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
949:   } else {
950:     mbs = a->mbs;
951:     ii  = a->i;
952:     z   = zarray;
953:   }

955:   for (i=0; i<mbs; i++) {
956:     n    = ii[i+1] - ii[i];
957:     idx  = ij + ii[i];
958:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

961:     for (j=0; j<n; j++) {
962:       xb = x + 15*(idx[j]);
963:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964:       x8 = xb[7];

966:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
967:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
968:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
969:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
970:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
971:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
972:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
973:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
974:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
975:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
976:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
977:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
978:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
979:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
980:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
981:       v     += 120;

983:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

985:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
986:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
987:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
988:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
989:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
990:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
991:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
992:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
993:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
994:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
995:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
996:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
997:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
998:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
999:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1000:       v     += 105;
1001:     }
1002:     if (usecprow) z = zarray + 15*ridx[i];
1003:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1004:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1006:     if (!usecprow) z += 15;
1007:   }

1009:   VecRestoreArrayRead(xx,&x);
1010:   VecRestoreArray(zz,&zarray);
1011:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1012:   return(0);
1013: }

1015: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

1017: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1018: {
1019:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1020:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1021:   const PetscScalar *x,*xb;
1022:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1023:   const MatScalar   *v;
1024:   PetscErrorCode    ierr;
1025:   const PetscInt    *ii,*ij=a->j,*idx;
1026:   PetscInt          mbs,i,j,n,*ridx=NULL;
1027:   PetscBool         usecprow=a->compressedrow.use;

1030:   VecGetArrayRead(xx,&x);
1031:   VecGetArray(zz,&zarray);

1033:   v = a->a;
1034:   if (usecprow) {
1035:     mbs  = a->compressedrow.nrows;
1036:     ii   = a->compressedrow.i;
1037:     ridx = a->compressedrow.rindex;
1038:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1039:   } else {
1040:     mbs = a->mbs;
1041:     ii  = a->i;
1042:     z   = zarray;
1043:   }

1045:   for (i=0; i<mbs; i++) {
1046:     n    = ii[i+1] - ii[i];
1047:     idx  = ij + ii[i];
1048:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1049:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1051:     for (j=0; j<n; j++) {
1052:       xb = x + 15*(idx[j]);
1053:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1054:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1056:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1057:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1058:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1059:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1060:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1061:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1062:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1063:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1064:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1065:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1066:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1067:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1068:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1069:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1070:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1071:       v     += 225;
1072:     }
1073:     if (usecprow) z = zarray + 15*ridx[i];
1074:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1075:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1077:     if (!usecprow) z += 15;
1078:   }

1080:   VecRestoreArrayRead(xx,&x);
1081:   VecRestoreArray(zz,&zarray);
1082:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1083:   return(0);
1084: }


1087: /*
1088:     This will not work with MatScalar == float because it calls the BLAS
1089: */
1090: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1091: {
1092:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1093:   PetscScalar       *z = 0,*work,*workt,*zarray;
1094:   const PetscScalar *x,*xb;
1095:   const MatScalar   *v;
1096:   PetscErrorCode    ierr;
1097:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1098:   const PetscInt    *idx,*ii,*ridx=NULL;
1099:   PetscInt          ncols,k;
1100:   PetscBool         usecprow=a->compressedrow.use;

1103:   VecGetArrayRead(xx,&x);
1104:   VecGetArray(zz,&zarray);

1106:   idx = a->j;
1107:   v   = a->a;
1108:   if (usecprow) {
1109:     mbs  = a->compressedrow.nrows;
1110:     ii   = a->compressedrow.i;
1111:     ridx = a->compressedrow.rindex;
1112:     PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1113:   } else {
1114:     mbs = a->mbs;
1115:     ii  = a->i;
1116:     z   = zarray;
1117:   }

1119:   if (!a->mult_work) {
1120:     k    = PetscMax(A->rmap->n,A->cmap->n);
1121:     PetscMalloc1(k+1,&a->mult_work);
1122:   }
1123:   work = a->mult_work;
1124:   for (i=0; i<mbs; i++) {
1125:     n           = ii[1] - ii[0]; ii++;
1126:     ncols       = n*bs;
1127:     workt       = work;
1128:     for (j=0; j<n; j++) {
1129:       xb = x + bs*(*idx++);
1130:       for (k=0; k<bs; k++) workt[k] = xb[k];
1131:       workt += bs;
1132:     }
1133:     if (usecprow) z = zarray + bs*ridx[i];
1134:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1135:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1136:     v += n*bs2;
1137:     if (!usecprow) z += bs;
1138:   }
1139:   VecRestoreArrayRead(xx,&x);
1140:   VecRestoreArray(zz,&zarray);
1141:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1142:   return(0);
1143: }

1145: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1146: {
1147:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1148:   const PetscScalar *x;
1149:   PetscScalar       *y,*z,sum;
1150:   const MatScalar   *v;
1151:   PetscErrorCode    ierr;
1152:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1153:   const PetscInt    *idx,*ii;
1154:   PetscBool         usecprow=a->compressedrow.use;

1157:   VecGetArrayRead(xx,&x);
1158:   VecGetArrayPair(yy,zz,&y,&z);

1160:   idx = a->j;
1161:   v   = a->a;
1162:   if (usecprow) {
1163:     if (zz != yy) {
1164:       PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1165:     }
1166:     mbs  = a->compressedrow.nrows;
1167:     ii   = a->compressedrow.i;
1168:     ridx = a->compressedrow.rindex;
1169:   } else {
1170:     ii = a->i;
1171:   }

1173:   for (i=0; i<mbs; i++) {
1174:     n = ii[1] - ii[0];
1175:     ii++;
1176:     if (!usecprow) {
1177:       sum         = y[i];
1178:     } else {
1179:       sum = y[ridx[i]];
1180:     }
1181:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1182:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1183:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1184:     v   += n;
1185:     idx += n;
1186:     if (usecprow) {
1187:       z[ridx[i]] = sum;
1188:     } else {
1189:       z[i] = sum;
1190:     }
1191:   }
1192:   VecRestoreArrayRead(xx,&x);
1193:   VecRestoreArrayPair(yy,zz,&y,&z);
1194:   PetscLogFlops(2.0*a->nz);
1195:   return(0);
1196: }

1198: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1199: {
1200:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1201:   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1202:   const PetscScalar *x,*xb;
1203:   PetscScalar       x1,x2,*yarray,*zarray;
1204:   const MatScalar   *v;
1205:   PetscErrorCode    ierr;
1206:   PetscInt          mbs = a->mbs,i,n,j;
1207:   const PetscInt    *idx,*ii,*ridx = NULL;
1208:   PetscBool         usecprow = a->compressedrow.use;

1211:   VecGetArrayRead(xx,&x);
1212:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1214:   idx = a->j;
1215:   v   = a->a;
1216:   if (usecprow) {
1217:     if (zz != yy) {
1218:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1219:     }
1220:     mbs  = a->compressedrow.nrows;
1221:     ii   = a->compressedrow.i;
1222:     ridx = a->compressedrow.rindex;
1223:   } else {
1224:     ii = a->i;
1225:     y  = yarray;
1226:     z  = zarray;
1227:   }

1229:   for (i=0; i<mbs; i++) {
1230:     n = ii[1] - ii[0]; ii++;
1231:     if (usecprow) {
1232:       z = zarray + 2*ridx[i];
1233:       y = yarray + 2*ridx[i];
1234:     }
1235:     sum1 = y[0]; sum2 = y[1];
1236:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1237:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1238:     for (j=0; j<n; j++) {
1239:       xb = x + 2*(*idx++);
1240:       x1 = xb[0];
1241:       x2 = xb[1];

1243:       sum1 += v[0]*x1 + v[2]*x2;
1244:       sum2 += v[1]*x1 + v[3]*x2;
1245:       v    += 4;
1246:     }
1247:     z[0] = sum1; z[1] = sum2;
1248:     if (!usecprow) {
1249:       z += 2; y += 2;
1250:     }
1251:   }
1252:   VecRestoreArrayRead(xx,&x);
1253:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1254:   PetscLogFlops(4.0*a->nz);
1255:   return(0);
1256: }

1258: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1259: {
1260:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1261:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1262:   const PetscScalar *x,*xb;
1263:   const MatScalar   *v;
1264:   PetscErrorCode    ierr;
1265:   PetscInt          mbs = a->mbs,i,j,n;
1266:   const PetscInt    *idx,*ii,*ridx = NULL;
1267:   PetscBool         usecprow = a->compressedrow.use;

1270:   VecGetArrayRead(xx,&x);
1271:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1273:   idx = a->j;
1274:   v   = a->a;
1275:   if (usecprow) {
1276:     if (zz != yy) {
1277:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1278:     }
1279:     mbs  = a->compressedrow.nrows;
1280:     ii   = a->compressedrow.i;
1281:     ridx = a->compressedrow.rindex;
1282:   } else {
1283:     ii = a->i;
1284:     y  = yarray;
1285:     z  = zarray;
1286:   }

1288:   for (i=0; i<mbs; i++) {
1289:     n = ii[1] - ii[0]; ii++;
1290:     if (usecprow) {
1291:       z = zarray + 3*ridx[i];
1292:       y = yarray + 3*ridx[i];
1293:     }
1294:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1295:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1296:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297:     for (j=0; j<n; j++) {
1298:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1299:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1300:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1301:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1302:       v    += 9;
1303:     }
1304:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1305:     if (!usecprow) {
1306:       z += 3; y += 3;
1307:     }
1308:   }
1309:   VecRestoreArrayRead(xx,&x);
1310:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1311:   PetscLogFlops(18.0*a->nz);
1312:   return(0);
1313: }

1315: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1316: {
1317:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1318:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1319:   const PetscScalar *x,*xb;
1320:   const MatScalar   *v;
1321:   PetscErrorCode    ierr;
1322:   PetscInt          mbs = a->mbs,i,j,n;
1323:   const PetscInt    *idx,*ii,*ridx=NULL;
1324:   PetscBool         usecprow=a->compressedrow.use;

1327:   VecGetArrayRead(xx,&x);
1328:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1330:   idx = a->j;
1331:   v   = a->a;
1332:   if (usecprow) {
1333:     if (zz != yy) {
1334:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1335:     }
1336:     mbs  = a->compressedrow.nrows;
1337:     ii   = a->compressedrow.i;
1338:     ridx = a->compressedrow.rindex;
1339:   } else {
1340:     ii = a->i;
1341:     y  = yarray;
1342:     z  = zarray;
1343:   }

1345:   for (i=0; i<mbs; i++) {
1346:     n = ii[1] - ii[0]; ii++;
1347:     if (usecprow) {
1348:       z = zarray + 4*ridx[i];
1349:       y = yarray + 4*ridx[i];
1350:     }
1351:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1352:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1353:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1354:     for (j=0; j<n; j++) {
1355:       xb    = x + 4*(*idx++);
1356:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1357:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1358:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1359:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1360:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1361:       v    += 16;
1362:     }
1363:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1364:     if (!usecprow) {
1365:       z += 4; y += 4;
1366:     }
1367:   }
1368:   VecRestoreArrayRead(xx,&x);
1369:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1370:   PetscLogFlops(32.0*a->nz);
1371:   return(0);
1372: }

1374: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1375: {
1376:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1377:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1378:   const PetscScalar *x,*xb;
1379:   PetscScalar       *yarray,*zarray;
1380:   const MatScalar   *v;
1381:   PetscErrorCode    ierr;
1382:   PetscInt          mbs = a->mbs,i,j,n;
1383:   const PetscInt    *idx,*ii,*ridx = NULL;
1384:   PetscBool         usecprow=a->compressedrow.use;

1387:   VecGetArrayRead(xx,&x);
1388:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1390:   idx = a->j;
1391:   v   = a->a;
1392:   if (usecprow) {
1393:     if (zz != yy) {
1394:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1395:     }
1396:     mbs  = a->compressedrow.nrows;
1397:     ii   = a->compressedrow.i;
1398:     ridx = a->compressedrow.rindex;
1399:   } else {
1400:     ii = a->i;
1401:     y  = yarray;
1402:     z  = zarray;
1403:   }

1405:   for (i=0; i<mbs; i++) {
1406:     n = ii[1] - ii[0]; ii++;
1407:     if (usecprow) {
1408:       z = zarray + 5*ridx[i];
1409:       y = yarray + 5*ridx[i];
1410:     }
1411:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1412:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1413:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1414:     for (j=0; j<n; j++) {
1415:       xb    = x + 5*(*idx++);
1416:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1417:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1418:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1419:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1420:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1421:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1422:       v    += 25;
1423:     }
1424:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1425:     if (!usecprow) {
1426:       z += 5; y += 5;
1427:     }
1428:   }
1429:   VecRestoreArrayRead(xx,&x);
1430:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1431:   PetscLogFlops(50.0*a->nz);
1432:   return(0);
1433: }
1434: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1435: {
1436:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1437:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1438:   const PetscScalar *x,*xb;
1439:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1440:   const MatScalar   *v;
1441:   PetscErrorCode    ierr;
1442:   PetscInt          mbs = a->mbs,i,j,n;
1443:   const PetscInt    *idx,*ii,*ridx=NULL;
1444:   PetscBool         usecprow=a->compressedrow.use;

1447:   VecGetArrayRead(xx,&x);
1448:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1450:   idx = a->j;
1451:   v   = a->a;
1452:   if (usecprow) {
1453:     if (zz != yy) {
1454:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1455:     }
1456:     mbs  = a->compressedrow.nrows;
1457:     ii   = a->compressedrow.i;
1458:     ridx = a->compressedrow.rindex;
1459:   } else {
1460:     ii = a->i;
1461:     y  = yarray;
1462:     z  = zarray;
1463:   }

1465:   for (i=0; i<mbs; i++) {
1466:     n = ii[1] - ii[0]; ii++;
1467:     if (usecprow) {
1468:       z = zarray + 6*ridx[i];
1469:       y = yarray + 6*ridx[i];
1470:     }
1471:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1472:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1473:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1474:     for (j=0; j<n; j++) {
1475:       xb    = x + 6*(*idx++);
1476:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1477:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1478:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1479:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1480:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1481:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1482:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1483:       v    += 36;
1484:     }
1485:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1486:     if (!usecprow) {
1487:       z += 6; y += 6;
1488:     }
1489:   }
1490:   VecRestoreArrayRead(xx,&x);
1491:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1492:   PetscLogFlops(72.0*a->nz);
1493:   return(0);
1494: }

1496: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1497: {
1498:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1499:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1500:   const PetscScalar *x,*xb;
1501:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1502:   const MatScalar   *v;
1503:   PetscErrorCode    ierr;
1504:   PetscInt          mbs = a->mbs,i,j,n;
1505:   const PetscInt    *idx,*ii,*ridx = NULL;
1506:   PetscBool         usecprow=a->compressedrow.use;

1509:   VecGetArrayRead(xx,&x);
1510:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1512:   idx = a->j;
1513:   v   = a->a;
1514:   if (usecprow) {
1515:     if (zz != yy) {
1516:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1517:     }
1518:     mbs  = a->compressedrow.nrows;
1519:     ii   = a->compressedrow.i;
1520:     ridx = a->compressedrow.rindex;
1521:   } else {
1522:     ii = a->i;
1523:     y  = yarray;
1524:     z  = zarray;
1525:   }

1527:   for (i=0; i<mbs; i++) {
1528:     n = ii[1] - ii[0]; ii++;
1529:     if (usecprow) {
1530:       z = zarray + 7*ridx[i];
1531:       y = yarray + 7*ridx[i];
1532:     }
1533:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1534:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1535:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536:     for (j=0; j<n; j++) {
1537:       xb    = x + 7*(*idx++);
1538:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1539:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1540:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1541:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1542:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1543:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1544:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1545:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1546:       v    += 49;
1547:     }
1548:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1549:     if (!usecprow) {
1550:       z += 7; y += 7;
1551:     }
1552:   }
1553:   VecRestoreArrayRead(xx,&x);
1554:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1555:   PetscLogFlops(98.0*a->nz);
1556:   return(0);
1557: }

1559: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1560: {
1561:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1562:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1563:   const PetscScalar *x,*xb;
1564:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1565:   const MatScalar   *v;
1566:   PetscErrorCode    ierr;
1567:   PetscInt          mbs = a->mbs,i,j,n;
1568:   const PetscInt    *idx,*ii,*ridx = NULL;
1569:   PetscBool         usecprow=a->compressedrow.use;

1572:   VecGetArrayRead(xx,&x);
1573:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1575:   idx = a->j;
1576:   v   = a->a;
1577:   if (usecprow) {
1578:     if (zz != yy) {
1579:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1580:     }
1581:     mbs  = a->compressedrow.nrows;
1582:     ii   = a->compressedrow.i;
1583:     ridx = a->compressedrow.rindex;
1584:   } else {
1585:     ii = a->i;
1586:     y  = yarray;
1587:     z  = zarray;
1588:   }

1590:   for (i=0; i<mbs; i++) {
1591:     n = ii[1] - ii[0]; ii++;
1592:     if (usecprow) {
1593:       z = zarray + 11*ridx[i];
1594:       y = yarray + 11*ridx[i];
1595:     }
1596:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1597:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1598:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1599:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600:     for (j=0; j<n; j++) {
1601:       xb    = x + 11*(*idx++);
1602:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1603:       sum1 += v[0]*x1 + v[11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1604:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1605:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1606:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1607:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1608:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1609:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1610:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1611:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1612:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1613:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1614:       v    += 121;
1615:     }
1616:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1617:     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1618:     if (!usecprow) {
1619:       z += 11; y += 11;
1620:     }
1621:   }
1622:   VecRestoreArrayRead(xx,&x);
1623:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1624:   PetscLogFlops(242.0*a->nz);
1625:   return(0);
1626: }

1628: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1629: {
1630:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1631:   PetscScalar       *z = 0,*work,*workt,*zarray;
1632:   const PetscScalar *x,*xb;
1633:   const MatScalar   *v;
1634:   PetscErrorCode    ierr;
1635:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1636:   PetscInt          ncols,k;
1637:   const PetscInt    *ridx = NULL,*idx,*ii;
1638:   PetscBool         usecprow = a->compressedrow.use;

1641:   VecCopy(yy,zz);
1642:   VecGetArrayRead(xx,&x);
1643:   VecGetArray(zz,&zarray);

1645:   idx = a->j;
1646:   v   = a->a;
1647:   if (usecprow) {
1648:     mbs  = a->compressedrow.nrows;
1649:     ii   = a->compressedrow.i;
1650:     ridx = a->compressedrow.rindex;
1651:   } else {
1652:     mbs = a->mbs;
1653:     ii  = a->i;
1654:     z   = zarray;
1655:   }

1657:   if (!a->mult_work) {
1658:     k    = PetscMax(A->rmap->n,A->cmap->n);
1659:     PetscMalloc1(k+1,&a->mult_work);
1660:   }
1661:   work = a->mult_work;
1662:   for (i=0; i<mbs; i++) {
1663:     n     = ii[1] - ii[0]; ii++;
1664:     ncols = n*bs;
1665:     workt = work;
1666:     for (j=0; j<n; j++) {
1667:       xb = x + bs*(*idx++);
1668:       for (k=0; k<bs; k++) workt[k] = xb[k];
1669:       workt += bs;
1670:     }
1671:     if (usecprow) z = zarray + bs*ridx[i];
1672:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1673:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1674:     v += n*bs2;
1675:     if (!usecprow) z += bs;
1676:   }
1677:   VecRestoreArrayRead(xx,&x);
1678:   VecRestoreArray(zz,&zarray);
1679:   PetscLogFlops(2.0*a->nz*bs2);
1680:   return(0);
1681: }

1683: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1684: {
1685:   PetscScalar    zero = 0.0;

1689:   VecSet(zz,zero);
1690:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1691:   return(0);
1692: }

1694: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1695: {
1696:   PetscScalar    zero = 0.0;

1700:   VecSet(zz,zero);
1701:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1702:   return(0);
1703: }

1705: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1706: {
1707:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1708:   PetscScalar       *z,x1,x2,x3,x4,x5;
1709:   const PetscScalar *x,*xb = NULL;
1710:   const MatScalar   *v;
1711:   PetscErrorCode    ierr;
1712:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1713:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1714:   Mat_CompressedRow cprow = a->compressedrow;
1715:   PetscBool         usecprow = cprow.use;

1718:   if (yy != zz) { VecCopy(yy,zz); }
1719:   VecGetArrayRead(xx,&x);
1720:   VecGetArray(zz,&z);

1722:   idx = a->j;
1723:   v   = a->a;
1724:   if (usecprow) {
1725:     mbs  = cprow.nrows;
1726:     ii   = cprow.i;
1727:     ridx = cprow.rindex;
1728:   } else {
1729:     mbs=a->mbs;
1730:     ii = a->i;
1731:     xb = x;
1732:   }

1734:   switch (bs) {
1735:   case 1:
1736:     for (i=0; i<mbs; i++) {
1737:       if (usecprow) xb = x + ridx[i];
1738:       x1 = xb[0];
1739:       ib = idx + ii[0];
1740:       n  = ii[1] - ii[0]; ii++;
1741:       for (j=0; j<n; j++) {
1742:         rval     = ib[j];
1743:         z[rval] += PetscConj(*v) * x1;
1744:         v++;
1745:       }
1746:       if (!usecprow) xb++;
1747:     }
1748:     break;
1749:   case 2:
1750:     for (i=0; i<mbs; i++) {
1751:       if (usecprow) xb = x + 2*ridx[i];
1752:       x1 = xb[0]; x2 = xb[1];
1753:       ib = idx + ii[0];
1754:       n  = ii[1] - ii[0]; ii++;
1755:       for (j=0; j<n; j++) {
1756:         rval       = ib[j]*2;
1757:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1758:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1759:         v         += 4;
1760:       }
1761:       if (!usecprow) xb += 2;
1762:     }
1763:     break;
1764:   case 3:
1765:     for (i=0; i<mbs; i++) {
1766:       if (usecprow) xb = x + 3*ridx[i];
1767:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1768:       ib = idx + ii[0];
1769:       n  = ii[1] - ii[0]; ii++;
1770:       for (j=0; j<n; j++) {
1771:         rval       = ib[j]*3;
1772:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1773:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1774:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1775:         v         += 9;
1776:       }
1777:       if (!usecprow) xb += 3;
1778:     }
1779:     break;
1780:   case 4:
1781:     for (i=0; i<mbs; i++) {
1782:       if (usecprow) xb = x + 4*ridx[i];
1783:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1784:       ib = idx + ii[0];
1785:       n  = ii[1] - ii[0]; ii++;
1786:       for (j=0; j<n; j++) {
1787:         rval       = ib[j]*4;
1788:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1789:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1790:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1791:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1792:         v         += 16;
1793:       }
1794:       if (!usecprow) xb += 4;
1795:     }
1796:     break;
1797:   case 5:
1798:     for (i=0; i<mbs; i++) {
1799:       if (usecprow) xb = x + 5*ridx[i];
1800:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1801:       x4 = xb[3]; x5 = xb[4];
1802:       ib = idx + ii[0];
1803:       n  = ii[1] - ii[0]; ii++;
1804:       for (j=0; j<n; j++) {
1805:         rval       = ib[j]*5;
1806:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1807:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1808:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1809:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1810:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1811:         v         += 25;
1812:       }
1813:       if (!usecprow) xb += 5;
1814:     }
1815:     break;
1816:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1817:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1818: #if 0
1819:     {
1820:       PetscInt          ncols,k,bs2=a->bs2;
1821:       PetscScalar       *work,*workt,zb;
1822:       const PetscScalar *xtmp;
1823:       if (!a->mult_work) {
1824:         k    = PetscMax(A->rmap->n,A->cmap->n);
1825:         PetscMalloc1(k+1,&a->mult_work);
1826:       }
1827:       work = a->mult_work;
1828:       xtmp = x;
1829:       for (i=0; i<mbs; i++) {
1830:         n     = ii[1] - ii[0]; ii++;
1831:         ncols = n*bs;
1832:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1833:         if (usecprow) xtmp = x + bs*ridx[i];
1834:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1835:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1836:         v += n*bs2;
1837:         if (!usecprow) xtmp += bs;
1838:         workt = work;
1839:         for (j=0; j<n; j++) {
1840:           zb = z + bs*(*idx++);
1841:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1842:           workt += bs;
1843:         }
1844:       }
1845:     }
1846: #endif
1847:   }
1848:   VecRestoreArrayRead(xx,&x);
1849:   VecRestoreArray(zz,&z);
1850:   PetscLogFlops(2.0*a->nz*a->bs2);
1851:   return(0);
1852: }

1854: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1855: {
1856:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1857:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1858:   const PetscScalar *x,*xb = 0;
1859:   const MatScalar   *v;
1860:   PetscErrorCode    ierr;
1861:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1862:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1863:   Mat_CompressedRow cprow   = a->compressedrow;
1864:   PetscBool         usecprow=cprow.use;

1867:   if (yy != zz) { VecCopy(yy,zz); }
1868:   VecGetArrayRead(xx,&x);
1869:   VecGetArray(zz,&z);

1871:   idx = a->j;
1872:   v   = a->a;
1873:   if (usecprow) {
1874:     mbs  = cprow.nrows;
1875:     ii   = cprow.i;
1876:     ridx = cprow.rindex;
1877:   } else {
1878:     mbs=a->mbs;
1879:     ii = a->i;
1880:     xb = x;
1881:   }

1883:   switch (bs) {
1884:   case 1:
1885:     for (i=0; i<mbs; i++) {
1886:       if (usecprow) xb = x + ridx[i];
1887:       x1 = xb[0];
1888:       ib = idx + ii[0];
1889:       n  = ii[1] - ii[0]; ii++;
1890:       for (j=0; j<n; j++) {
1891:         rval     = ib[j];
1892:         z[rval] += *v * x1;
1893:         v++;
1894:       }
1895:       if (!usecprow) xb++;
1896:     }
1897:     break;
1898:   case 2:
1899:     for (i=0; i<mbs; i++) {
1900:       if (usecprow) xb = x + 2*ridx[i];
1901:       x1 = xb[0]; x2 = xb[1];
1902:       ib = idx + ii[0];
1903:       n  = ii[1] - ii[0]; ii++;
1904:       for (j=0; j<n; j++) {
1905:         rval       = ib[j]*2;
1906:         z[rval++] += v[0]*x1 + v[1]*x2;
1907:         z[rval++] += v[2]*x1 + v[3]*x2;
1908:         v         += 4;
1909:       }
1910:       if (!usecprow) xb += 2;
1911:     }
1912:     break;
1913:   case 3:
1914:     for (i=0; i<mbs; i++) {
1915:       if (usecprow) xb = x + 3*ridx[i];
1916:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1917:       ib = idx + ii[0];
1918:       n  = ii[1] - ii[0]; ii++;
1919:       for (j=0; j<n; j++) {
1920:         rval       = ib[j]*3;
1921:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1922:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1923:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1924:         v         += 9;
1925:       }
1926:       if (!usecprow) xb += 3;
1927:     }
1928:     break;
1929:   case 4:
1930:     for (i=0; i<mbs; i++) {
1931:       if (usecprow) xb = x + 4*ridx[i];
1932:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1933:       ib = idx + ii[0];
1934:       n  = ii[1] - ii[0]; ii++;
1935:       for (j=0; j<n; j++) {
1936:         rval       = ib[j]*4;
1937:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1938:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1939:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1940:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1941:         v         += 16;
1942:       }
1943:       if (!usecprow) xb += 4;
1944:     }
1945:     break;
1946:   case 5:
1947:     for (i=0; i<mbs; i++) {
1948:       if (usecprow) xb = x + 5*ridx[i];
1949:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1950:       x4 = xb[3]; x5 = xb[4];
1951:       ib = idx + ii[0];
1952:       n  = ii[1] - ii[0]; ii++;
1953:       for (j=0; j<n; j++) {
1954:         rval       = ib[j]*5;
1955:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1956:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1957:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1958:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1959:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1960:         v         += 25;
1961:       }
1962:       if (!usecprow) xb += 5;
1963:     }
1964:     break;
1965:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1966:     PetscInt          ncols,k;
1967:     PetscScalar       *work,*workt;
1968:     const PetscScalar *xtmp;
1969:     if (!a->mult_work) {
1970:       k    = PetscMax(A->rmap->n,A->cmap->n);
1971:       PetscMalloc1(k+1,&a->mult_work);
1972:     }
1973:     work = a->mult_work;
1974:     xtmp = x;
1975:     for (i=0; i<mbs; i++) {
1976:       n     = ii[1] - ii[0]; ii++;
1977:       ncols = n*bs;
1978:       PetscMemzero(work,ncols*sizeof(PetscScalar));
1979:       if (usecprow) xtmp = x + bs*ridx[i];
1980:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1981:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1982:       v += n*bs2;
1983:       if (!usecprow) xtmp += bs;
1984:       workt = work;
1985:       for (j=0; j<n; j++) {
1986:         zb = z + bs*(*idx++);
1987:         for (k=0; k<bs; k++) zb[k] += workt[k];
1988:         workt += bs;
1989:       }
1990:     }
1991:     }
1992:   }
1993:   VecRestoreArrayRead(xx,&x);
1994:   VecRestoreArray(zz,&z);
1995:   PetscLogFlops(2.0*a->nz*a->bs2);
1996:   return(0);
1997: }

1999: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2000: {
2001:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2002:   PetscInt       totalnz = a->bs2*a->nz;
2003:   PetscScalar    oalpha  = alpha;
2005:   PetscBLASInt   one = 1,tnz;

2008:   PetscBLASIntCast(totalnz,&tnz);
2009:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2010:   PetscLogFlops(totalnz);
2011:   return(0);
2012: }

2014: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2015: {
2017:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2018:   MatScalar      *v  = a->a;
2019:   PetscReal      sum = 0.0;
2020:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2023:   if (type == NORM_FROBENIUS) {
2024: #if defined(PETSC_USE_REAL___FP16)
2025:     PetscBLASInt one = 1,cnt = bs2*nz;
2026:     *norm = BLASnrm2_(&cnt,v,&one);
2027: #else
2028:     for (i=0; i<bs2*nz; i++) {
2029:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2030:     }
2031: #endif
2032:     *norm = PetscSqrtReal(sum);
2033:     PetscLogFlops(2*bs2*nz);
2034:   } else if (type == NORM_1) { /* maximum column sum */
2035:     PetscReal *tmp;
2036:     PetscInt  *bcol = a->j;
2037:     PetscCalloc1(A->cmap->n+1,&tmp);
2038:     for (i=0; i<nz; i++) {
2039:       for (j=0; j<bs; j++) {
2040:         k1 = bs*(*bcol) + j; /* column index */
2041:         for (k=0; k<bs; k++) {
2042:           tmp[k1] += PetscAbsScalar(*v); v++;
2043:         }
2044:       }
2045:       bcol++;
2046:     }
2047:     *norm = 0.0;
2048:     for (j=0; j<A->cmap->n; j++) {
2049:       if (tmp[j] > *norm) *norm = tmp[j];
2050:     }
2051:     PetscFree(tmp);
2052:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2053:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2054:     *norm = 0.0;
2055:     for (k=0; k<bs; k++) {
2056:       for (j=0; j<a->mbs; j++) {
2057:         v   = a->a + bs2*a->i[j] + k;
2058:         sum = 0.0;
2059:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2060:           for (k1=0; k1<bs; k1++) {
2061:             sum += PetscAbsScalar(*v);
2062:             v   += bs;
2063:           }
2064:         }
2065:         if (sum > *norm) *norm = sum;
2066:       }
2067:     }
2068:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2069:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2070:   return(0);
2071: }


2074: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2075: {
2076:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2080:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2081:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2082:     *flg = PETSC_FALSE;
2083:     return(0);
2084:   }

2086:   /* if the a->i are the same */
2087:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2088:   if (!*flg) return(0);

2090:   /* if a->j are the same */
2091:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2092:   if (!*flg) return(0);

2094:   /* if a->a are the same */
2095:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2096:   return(0);

2098: }

2100: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2101: {
2102:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2104:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2105:   PetscScalar    *x,zero = 0.0;
2106:   MatScalar      *aa,*aa_j;

2109:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2110:   bs   = A->rmap->bs;
2111:   aa   = a->a;
2112:   ai   = a->i;
2113:   aj   = a->j;
2114:   ambs = a->mbs;
2115:   bs2  = a->bs2;

2117:   VecSet(v,zero);
2118:   VecGetArray(v,&x);
2119:   VecGetLocalSize(v,&n);
2120:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2121:   for (i=0; i<ambs; i++) {
2122:     for (j=ai[i]; j<ai[i+1]; j++) {
2123:       if (aj[j] == i) {
2124:         row  = i*bs;
2125:         aa_j = aa+j*bs2;
2126:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2127:         break;
2128:       }
2129:     }
2130:   }
2131:   VecRestoreArray(v,&x);
2132:   return(0);
2133: }

2135: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2136: {
2137:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2138:   const PetscScalar *l,*r,*li,*ri;
2139:   PetscScalar       x;
2140:   MatScalar         *aa, *v;
2141:   PetscErrorCode    ierr;
2142:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2143:   const PetscInt    *ai,*aj;

2146:   ai  = a->i;
2147:   aj  = a->j;
2148:   aa  = a->a;
2149:   m   = A->rmap->n;
2150:   n   = A->cmap->n;
2151:   bs  = A->rmap->bs;
2152:   mbs = a->mbs;
2153:   bs2 = a->bs2;
2154:   if (ll) {
2155:     VecGetArrayRead(ll,&l);
2156:     VecGetLocalSize(ll,&lm);
2157:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2158:     for (i=0; i<mbs; i++) { /* for each block row */
2159:       M  = ai[i+1] - ai[i];
2160:       li = l + i*bs;
2161:       v  = aa + bs2*ai[i];
2162:       for (j=0; j<M; j++) { /* for each block */
2163:         for (k=0; k<bs2; k++) {
2164:           (*v++) *= li[k%bs];
2165:         }
2166:       }
2167:     }
2168:     VecRestoreArrayRead(ll,&l);
2169:     PetscLogFlops(a->nz);
2170:   }

2172:   if (rr) {
2173:     VecGetArrayRead(rr,&r);
2174:     VecGetLocalSize(rr,&rn);
2175:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2176:     for (i=0; i<mbs; i++) { /* for each block row */
2177:       iai = ai[i];
2178:       M   = ai[i+1] - iai;
2179:       v   = aa + bs2*iai;
2180:       for (j=0; j<M; j++) { /* for each block */
2181:         ri = r + bs*aj[iai+j];
2182:         for (k=0; k<bs; k++) {
2183:           x = ri[k];
2184:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2185:           v += bs;
2186:         }
2187:       }
2188:     }
2189:     VecRestoreArrayRead(rr,&r);
2190:     PetscLogFlops(a->nz);
2191:   }
2192:   return(0);
2193: }


2196: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2197: {
2198:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2201:   info->block_size   = a->bs2;
2202:   info->nz_allocated = a->bs2*a->maxnz;
2203:   info->nz_used      = a->bs2*a->nz;
2204:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2205:   info->assemblies   = A->num_ass;
2206:   info->mallocs      = A->info.mallocs;
2207:   info->memory       = ((PetscObject)A)->mem;
2208:   if (A->factortype) {
2209:     info->fill_ratio_given  = A->info.fill_ratio_given;
2210:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2211:     info->factor_mallocs    = A->info.factor_mallocs;
2212:   } else {
2213:     info->fill_ratio_given  = 0;
2214:     info->fill_ratio_needed = 0;
2215:     info->factor_mallocs    = 0;
2216:   }
2217:   return(0);
2218: }

2220: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2221: {
2222:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2226:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2227:   return(0);
2228: }