Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8: #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 15:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 16:   const PetscInt *idx;
 17:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 18:   PetscBT        table;

 21:   m  = a->mbs;
 22:   ai = a->i;
 23:   aj = a->j;
 24:   bs = A->rmap->bs;

 26:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 28:   PetscBTCreate(m,&table);
 29:   PetscMalloc1(m+1,&nidx);
 30:   PetscMalloc1(A->rmap->N+1,&nidx2);

 32:   for (i=0; i<is_max; i++) {
 33:     /* Initialise the two local arrays */
 34:     isz  = 0;
 35:     PetscBTMemzero(m,table);

 37:     /* Extract the indices, assume there can be duplicate entries */
 38:     ISGetIndices(is[i],&idx);
 39:     ISGetLocalSize(is[i],&n);

 41:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 42:     for (j=0; j<n; ++j) {
 43:       ival = idx[j]/bs; /* convert the indices into block indices */
 44:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 45:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);

 50:     k = 0;
 51:     for (j=0; j<ov; j++) { /* for each overlap*/
 52:       n = isz;
 53:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 54:         row   = nidx[k];
 55:         start = ai[row];
 56:         end   = ai[row+1];
 57:         for (l = start; l<end; l++) {
 58:           val = aj[l];
 59:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 60:         }
 61:       }
 62:     }
 63:     /* expand the Index Set */
 64:     for (j=0; j<isz; j++) {
 65:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 66:     }
 67:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 68:   }
 69:   PetscBTDestroy(&table);
 70:   PetscFree(nidx);
 71:   PetscFree(nidx2);
 72:   return(0);
 73: }

 75: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 76: {
 77:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 79:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 80:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 81:   const PetscInt *irow,*icol;
 82:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscBool      flag;

 89:   ISGetIndices(isrow,&irow);
 90:   ISGetIndices(iscol,&icol);
 91:   ISGetLocalSize(isrow,&nrows);
 92:   ISGetLocalSize(iscol,&ncols);

 94:   PetscCalloc1(1+oldcols,&smap);
 95:   ssmap = smap;
 96:   PetscMalloc1(1+nrows,&lens);
 97:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 98:   /* determine lens of each row */
 99:   for (i=0; i<nrows; i++) {
100:     kstart  = ai[irow[i]];
101:     kend    = kstart + a->ilen[irow[i]];
102:     lens[i] = 0;
103:     for (k=kstart; k<kend; k++) {
104:       if (ssmap[aj[k]]) lens[i]++;
105:     }
106:   }
107:   /* Create and fill new matrix */
108:   if (scall == MAT_REUSE_MATRIX) {
109:     c = (Mat_SeqBAIJ*)((*B)->data);

111:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
113:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114:     PetscArrayzero(c->ilen,c->mbs);
115:     C    = *B;
116:   } else {
117:     MatCreate(PetscObjectComm((PetscObject)A),&C);
118:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119:     MatSetType(C,((PetscObject)A)->type_name);
120:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
121:   }
122:   c = (Mat_SeqBAIJ*)(C->data);
123:   for (i=0; i<nrows; i++) {
124:     row      = irow[i];
125:     kstart   = ai[row];
126:     kend     = kstart + a->ilen[row];
127:     mat_i    = c->i[i];
128:     mat_j    = c->j + mat_i;
129:     mat_a    = c->a + mat_i*bs2;
130:     mat_ilen = c->ilen + i;
131:     for (k=kstart; k<kend; k++) {
132:       if ((tcol=ssmap[a->j[k]])) {
133:         *mat_j++ = tcol - 1;
134:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
135:         mat_a   += bs2;
136:         (*mat_ilen)++;
137:       }
138:     }
139:   }
140:   /* sort */
141:   {
142:     MatScalar *work;
143:     PetscMalloc1(bs2,&work);
144:     for (i=0; i<nrows; i++) {
145:       PetscInt ilen;
146:       mat_i = c->i[i];
147:       mat_j = c->j + mat_i;
148:       mat_a = c->a + mat_i*bs2;
149:       ilen  = c->ilen[i];
150:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151:     }
152:     PetscFree(work);
153:   }

155:   /* Free work space */
156:   ISRestoreIndices(iscol,&icol);
157:   PetscFree(smap);
158:   PetscFree(lens);
159:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

162:   ISRestoreIndices(isrow,&irow);
163:   *B   = C;
164:   return(0);
165: }

167: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168: {
169:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
170:   IS             is1,is2;
172:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173:   const PetscInt *irow,*icol;

176:   ISGetIndices(isrow,&irow);
177:   ISGetIndices(iscol,&icol);
178:   ISGetLocalSize(isrow,&nrows);
179:   ISGetLocalSize(iscol,&ncols);

181:   /* Verify if the indices corespond to each element in a block
182:    and form the IS with compressed IS */
183:   maxmnbs = PetscMax(a->mbs,a->nbs);
184:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
185:   PetscArrayzero(vary,a->mbs);
186:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187:   for (i=0; i<a->mbs; i++) {
188:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189:   }
190:   count = 0;
191:   for (i=0; i<nrows; i++) {
192:     j = irow[i] / bs;
193:     if ((vary[j]--)==bs) iary[count++] = j;
194:   }
195:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

197:   PetscArrayzero(vary,a->nbs);
198:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199:   for (i=0; i<a->nbs; i++) {
200:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201:   }
202:   count = 0;
203:   for (i=0; i<ncols; i++) {
204:     j = icol[i] / bs;
205:     if ((vary[j]--)==bs) iary[count++] = j;
206:   }
207:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
208:   ISRestoreIndices(isrow,&irow);
209:   ISRestoreIndices(iscol,&icol);
210:   PetscFree2(vary,iary);

212:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
213:   ISDestroy(&is1);
214:   ISDestroy(&is2);
215:   return(0);
216: }

218: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219: {
221:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
222:   Mat_SubSppt    *submatj = c->submatis1;

225:   (*submatj->destroy)(C);
226:   MatDestroySubMatrix_Private(submatj);
227:   return(0);
228: }

230: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231: {
233:   PetscInt       i;
234:   Mat            C;
235:   Mat_SeqBAIJ    *c;
236:   Mat_SubSppt    *submatj;

239:   for (i=0; i<n; i++) {
240:     C       = (*mat)[i];
241:     c       = (Mat_SeqBAIJ*)C->data;
242:     submatj = c->submatis1;
243:     if (submatj) {
244:       if (--((PetscObject)C)->refct <= 0) {
245:         (*submatj->destroy)(C);
246:         MatDestroySubMatrix_Private(submatj);
247:         PetscFree(C->defaultvectype);
248:         PetscLayoutDestroy(&C->rmap);
249:         PetscLayoutDestroy(&C->cmap);
250:         PetscHeaderDestroy(&C);
251:       }
252:     } else {
253:       MatDestroy(&C);
254:     }
255:   }

257:   /* Destroy Dummy submatrices created for reuse */
258:   MatDestroySubMatrices_Dummy(n,mat);

260:   PetscFree(*mat);
261:   return(0);
262: }

264: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
265: {
267:   PetscInt       i;

270:   if (scall == MAT_INITIAL_MATRIX) {
271:     PetscCalloc1(n+1,B);
272:   }

274:   for (i=0; i<n; i++) {
275:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
276:   }
277:   return(0);
278: }

280: /* -------------------------------------------------------*/
281: /* Should check that shapes of vectors and matrices match */
282: /* -------------------------------------------------------*/

284: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
285: {
286:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
287:   PetscScalar       *z,sum;
288:   const PetscScalar *x;
289:   const MatScalar   *v;
290:   PetscErrorCode    ierr;
291:   PetscInt          mbs,i,n;
292:   const PetscInt    *idx,*ii,*ridx=NULL;
293:   PetscBool         usecprow=a->compressedrow.use;

296:   VecGetArrayRead(xx,&x);
297:   VecGetArrayWrite(zz,&z);

299:   if (usecprow) {
300:     mbs  = a->compressedrow.nrows;
301:     ii   = a->compressedrow.i;
302:     ridx = a->compressedrow.rindex;
303:     PetscArrayzero(z,a->mbs);
304:   } else {
305:     mbs = a->mbs;
306:     ii  = a->i;
307:   }

309:   for (i=0; i<mbs; i++) {
310:     n   = ii[1] - ii[0];
311:     v   = a->a + ii[0];
312:     idx = a->j + ii[0];
313:     ii++;
314:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
315:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
316:     sum = 0.0;
317:     PetscSparseDensePlusDot(sum,x,v,idx,n);
318:     if (usecprow) {
319:       z[ridx[i]] = sum;
320:     } else {
321:       z[i]       = sum;
322:     }
323:   }
324:   VecRestoreArrayRead(xx,&x);
325:   VecRestoreArrayWrite(zz,&z);
326:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
327:   return(0);
328: }

330: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
331: {
332:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333:   PetscScalar       *z = NULL,sum1,sum2,*zarray;
334:   const PetscScalar *x,*xb;
335:   PetscScalar       x1,x2;
336:   const MatScalar   *v;
337:   PetscErrorCode    ierr;
338:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
339:   PetscBool         usecprow=a->compressedrow.use;

342:   VecGetArrayRead(xx,&x);
343:   VecGetArrayWrite(zz,&zarray);

345:   idx = a->j;
346:   v   = a->a;
347:   if (usecprow) {
348:     mbs  = a->compressedrow.nrows;
349:     ii   = a->compressedrow.i;
350:     ridx = a->compressedrow.rindex;
351:     PetscArrayzero(zarray,2*a->mbs);
352:   } else {
353:     mbs = a->mbs;
354:     ii  = a->i;
355:     z   = zarray;
356:   }

358:   for (i=0; i<mbs; i++) {
359:     n           = ii[1] - ii[0]; ii++;
360:     sum1        = 0.0; sum2 = 0.0;
361:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
362:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
363:     for (j=0; j<n; j++) {
364:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
365:       sum1 += v[0]*x1 + v[2]*x2;
366:       sum2 += v[1]*x1 + v[3]*x2;
367:       v    += 4;
368:     }
369:     if (usecprow) z = zarray + 2*ridx[i];
370:     z[0] = sum1; z[1] = sum2;
371:     if (!usecprow) z += 2;
372:   }
373:   VecRestoreArrayRead(xx,&x);
374:   VecRestoreArrayWrite(zz,&zarray);
375:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
376:   return(0);
377: }

379: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
380: {
381:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
382:   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
383:   const PetscScalar *x,*xb;
384:   const MatScalar   *v;
385:   PetscErrorCode    ierr;
386:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
387:   PetscBool         usecprow=a->compressedrow.use;

389: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
390: #pragma disjoint(*v,*z,*xb)
391: #endif

394:   VecGetArrayRead(xx,&x);
395:   VecGetArrayWrite(zz,&zarray);

397:   idx = a->j;
398:   v   = a->a;
399:   if (usecprow) {
400:     mbs  = a->compressedrow.nrows;
401:     ii   = a->compressedrow.i;
402:     ridx = a->compressedrow.rindex;
403:     PetscArrayzero(zarray,3*a->mbs);
404:   } else {
405:     mbs = a->mbs;
406:     ii  = a->i;
407:     z   = zarray;
408:   }

410:   for (i=0; i<mbs; i++) {
411:     n           = ii[1] - ii[0]; ii++;
412:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
413:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
414:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415:     for (j=0; j<n; j++) {
416:       xb = x + 3*(*idx++);
417:       x1 = xb[0];
418:       x2 = xb[1];
419:       x3 = xb[2];

421:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
422:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
423:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
424:       v    += 9;
425:     }
426:     if (usecprow) z = zarray + 3*ridx[i];
427:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
428:     if (!usecprow) z += 3;
429:   }
430:   VecRestoreArrayRead(xx,&x);
431:   VecRestoreArrayWrite(zz,&zarray);
432:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
433:   return(0);
434: }

436: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
437: {
438:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
439:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
440:   const PetscScalar *x,*xb;
441:   const MatScalar   *v;
442:   PetscErrorCode    ierr;
443:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
444:   PetscBool         usecprow=a->compressedrow.use;

447:   VecGetArrayRead(xx,&x);
448:   VecGetArrayWrite(zz,&zarray);

450:   idx = a->j;
451:   v   = a->a;
452:   if (usecprow) {
453:     mbs  = a->compressedrow.nrows;
454:     ii   = a->compressedrow.i;
455:     ridx = a->compressedrow.rindex;
456:     PetscArrayzero(zarray,4*a->mbs);
457:   } else {
458:     mbs = a->mbs;
459:     ii  = a->i;
460:     z   = zarray;
461:   }

463:   for (i=0; i<mbs; i++) {
464:     n = ii[1] - ii[0];
465:     ii++;
466:     sum1 = 0.0;
467:     sum2 = 0.0;
468:     sum3 = 0.0;
469:     sum4 = 0.0;

471:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
472:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
473:     for (j=0; j<n; j++) {
474:       xb    = x + 4*(*idx++);
475:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
476:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
477:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
478:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
479:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
480:       v    += 16;
481:     }
482:     if (usecprow) z = zarray + 4*ridx[i];
483:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
484:     if (!usecprow) z += 4;
485:   }
486:   VecRestoreArrayRead(xx,&x);
487:   VecRestoreArrayWrite(zz,&zarray);
488:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
489:   return(0);
490: }

492: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
493: {
494:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
495:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
496:   const PetscScalar *xb,*x;
497:   const MatScalar   *v;
498:   PetscErrorCode    ierr;
499:   const PetscInt    *idx,*ii,*ridx=NULL;
500:   PetscInt          mbs,i,j,n;
501:   PetscBool         usecprow=a->compressedrow.use;

504:   VecGetArrayRead(xx,&x);
505:   VecGetArrayWrite(zz,&zarray);

507:   idx = a->j;
508:   v   = a->a;
509:   if (usecprow) {
510:     mbs  = a->compressedrow.nrows;
511:     ii   = a->compressedrow.i;
512:     ridx = a->compressedrow.rindex;
513:     PetscArrayzero(zarray,5*a->mbs);
514:   } else {
515:     mbs = a->mbs;
516:     ii  = a->i;
517:     z   = zarray;
518:   }

520:   for (i=0; i<mbs; i++) {
521:     n           = ii[1] - ii[0]; ii++;
522:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
523:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
524:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
525:     for (j=0; j<n; j++) {
526:       xb    = x + 5*(*idx++);
527:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
528:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
529:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
530:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
531:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
532:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
533:       v    += 25;
534:     }
535:     if (usecprow) z = zarray + 5*ridx[i];
536:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
537:     if (!usecprow) z += 5;
538:   }
539:   VecRestoreArrayRead(xx,&x);
540:   VecRestoreArrayWrite(zz,&zarray);
541:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
542:   return(0);
543: }

545: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
546: {
547:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
548:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
549:   const PetscScalar *x,*xb;
550:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
551:   const MatScalar   *v;
552:   PetscErrorCode    ierr;
553:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
554:   PetscBool         usecprow=a->compressedrow.use;

557:   VecGetArrayRead(xx,&x);
558:   VecGetArrayWrite(zz,&zarray);

560:   idx = a->j;
561:   v   = a->a;
562:   if (usecprow) {
563:     mbs  = a->compressedrow.nrows;
564:     ii   = a->compressedrow.i;
565:     ridx = a->compressedrow.rindex;
566:     PetscArrayzero(zarray,6*a->mbs);
567:   } else {
568:     mbs = a->mbs;
569:     ii  = a->i;
570:     z   = zarray;
571:   }

573:   for (i=0; i<mbs; i++) {
574:     n  = ii[1] - ii[0];
575:     ii++;
576:     sum1 = 0.0;
577:     sum2 = 0.0;
578:     sum3 = 0.0;
579:     sum4 = 0.0;
580:     sum5 = 0.0;
581:     sum6 = 0.0;

583:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
584:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585:     for (j=0; j<n; j++) {
586:       xb    = x + 6*(*idx++);
587:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
588:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
589:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
590:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
591:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
592:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
593:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
594:       v    += 36;
595:     }
596:     if (usecprow) z = zarray + 6*ridx[i];
597:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
598:     if (!usecprow) z += 6;
599:   }

601:   VecRestoreArrayRead(xx,&x);
602:   VecRestoreArrayWrite(zz,&zarray);
603:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
604:   return(0);
605: }

607: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
608: {
609:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
610:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
611:   const PetscScalar *x,*xb;
612:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
613:   const MatScalar   *v;
614:   PetscErrorCode    ierr;
615:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
616:   PetscBool         usecprow=a->compressedrow.use;

619:   VecGetArrayRead(xx,&x);
620:   VecGetArrayWrite(zz,&zarray);

622:   idx = a->j;
623:   v   = a->a;
624:   if (usecprow) {
625:     mbs  = a->compressedrow.nrows;
626:     ii   = a->compressedrow.i;
627:     ridx = a->compressedrow.rindex;
628:     PetscArrayzero(zarray,7*a->mbs);
629:   } else {
630:     mbs = a->mbs;
631:     ii  = a->i;
632:     z   = zarray;
633:   }

635:   for (i=0; i<mbs; i++) {
636:     n  = ii[1] - ii[0];
637:     ii++;
638:     sum1 = 0.0;
639:     sum2 = 0.0;
640:     sum3 = 0.0;
641:     sum4 = 0.0;
642:     sum5 = 0.0;
643:     sum6 = 0.0;
644:     sum7 = 0.0;

646:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
647:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
648:     for (j=0; j<n; j++) {
649:       xb    = x + 7*(*idx++);
650:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
651:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
652:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
653:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
654:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
655:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
656:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
657:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
658:       v    += 49;
659:     }
660:     if (usecprow) z = zarray + 7*ridx[i];
661:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
662:     if (!usecprow) z += 7;
663:   }

665:   VecRestoreArrayRead(xx,&x);
666:   VecRestoreArrayWrite(zz,&zarray);
667:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
668:   return(0);
669: }

671: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
672: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
673: {
674:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
675:   PetscScalar       *z = NULL,*work,*workt,*zarray;
676:   const PetscScalar *x,*xb;
677:   const MatScalar   *v;
678:   PetscErrorCode    ierr;
679:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
680:   const PetscInt    *idx,*ii,*ridx=NULL;
681:   PetscInt          k;
682:   PetscBool         usecprow=a->compressedrow.use;

684:   __m256d a0,a1,a2,a3,a4,a5;
685:   __m256d w0,w1,w2,w3;
686:   __m256d z0,z1,z2;
687:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

690:   VecGetArrayRead(xx,&x);
691:   VecGetArrayWrite(zz,&zarray);

693:   idx = a->j;
694:   v   = a->a;
695:   if (usecprow) {
696:     mbs  = a->compressedrow.nrows;
697:     ii   = a->compressedrow.i;
698:     ridx = a->compressedrow.rindex;
699:     PetscArrayzero(zarray,bs*a->mbs);
700:   } else {
701:     mbs = a->mbs;
702:     ii  = a->i;
703:     z   = zarray;
704:   }

706:   if (!a->mult_work) {
707:     k    = PetscMax(A->rmap->n,A->cmap->n);
708:     PetscMalloc1(k+1,&a->mult_work);
709:   }

711:   work = a->mult_work;
712:   for (i=0; i<mbs; i++) {
713:     n           = ii[1] - ii[0]; ii++;
714:     workt       = work;
715:     for (j=0; j<n; j++) {
716:       xb = x + bs*(*idx++);
717:       for (k=0; k<bs; k++) workt[k] = xb[k];
718:       workt += bs;
719:     }
720:     if (usecprow) z = zarray + bs*ridx[i];

722:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

724:     for (j=0; j<n; j++) {
725:       /* first column of a */
726:       w0 = _mm256_set1_pd(work[j*9  ]);
727:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
728:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
729:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

731:       /* second column of a */
732:       w1 = _mm256_set1_pd(work[j*9+ 1]);
733:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
734:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
735:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

737:       /* third column of a */
738:       w2 = _mm256_set1_pd(work[j*9 +2]);
739:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
740:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
741:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

743:       /* fourth column of a */
744:       w3 = _mm256_set1_pd(work[j*9+ 3]);
745:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
746:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
747:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

749:       /* fifth column of a */
750:       w0 = _mm256_set1_pd(work[j*9+ 4]);
751:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
752:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
753:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

755:       /* sixth column of a */
756:       w1 = _mm256_set1_pd(work[j*9+ 5]);
757:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
758:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
759:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

761:       /* seventh column of a */
762:       w2 = _mm256_set1_pd(work[j*9+ 6]);
763:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
764:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
765:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

767:       /* eigth column of a */
768:       w3 = _mm256_set1_pd(work[j*9+ 7]);
769:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
770:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
771:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

773:       /* ninth column of a */
774:       w0 = _mm256_set1_pd(work[j*9+ 8]);
775:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
776:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
777:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
778:     }

780:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

782:     v += n*bs2;
783:     if (!usecprow) z += bs;
784:   }
785:   VecRestoreArrayRead(xx,&x);
786:   VecRestoreArrayWrite(zz,&zarray);
787:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
788:   return(0);
789: }
790: #endif

792: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
793: {
794:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
795:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
796:   const PetscScalar *x,*xb;
797:   PetscScalar       *zarray,xv;
798:   const MatScalar   *v;
799:   PetscErrorCode    ierr;
800:   const PetscInt    *ii,*ij=a->j,*idx;
801:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
802:   PetscBool         usecprow=a->compressedrow.use;

805:   VecGetArrayRead(xx,&x);
806:   VecGetArrayWrite(zz,&zarray);

808:   v = a->a;
809:   if (usecprow) {
810:     mbs  = a->compressedrow.nrows;
811:     ii   = a->compressedrow.i;
812:     ridx = a->compressedrow.rindex;
813:     PetscArrayzero(zarray,11*a->mbs);
814:   } else {
815:     mbs = a->mbs;
816:     ii  = a->i;
817:     z   = zarray;
818:   }

820:   for (i=0; i<mbs; i++) {
821:     n    = ii[i+1] - ii[i];
822:     idx  = ij + ii[i];
823:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
824:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

826:     for (j=0; j<n; j++) {
827:       xb = x + 11*(idx[j]);

829:       for (k=0; k<11; k++) {
830:         xv     =  xb[k];
831:         sum1  += v[0]*xv;
832:         sum2  += v[1]*xv;
833:         sum3  += v[2]*xv;
834:         sum4  += v[3]*xv;
835:         sum5  += v[4]*xv;
836:         sum6  += v[5]*xv;
837:         sum7  += v[6]*xv;
838:         sum8  += v[7]*xv;
839:         sum9  += v[8]*xv;
840:         sum10 += v[9]*xv;
841:         sum11 += v[10]*xv;
842:         v     += 11;
843:       }
844:     }
845:     if (usecprow) z = zarray + 11*ridx[i];
846:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
847:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

849:     if (!usecprow) z += 11;
850:   }

852:   VecRestoreArrayRead(xx,&x);
853:   VecRestoreArrayWrite(zz,&zarray);
854:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
855:   return(0);
856: }

858: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
859: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
860: {
861:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
862:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
863:   const PetscScalar *x,*xb;
864:   PetscScalar       *zarray,xv;
865:   const MatScalar   *v;
866:   PetscErrorCode    ierr;
867:   const PetscInt    *ii,*ij=a->j,*idx;
868:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
869:   PetscBool         usecprow=a->compressedrow.use;

872:   VecGetArrayRead(xx,&x);
873:   VecGetArrayWrite(zz,&zarray);

875:   v = a->a;
876:   if (usecprow) {
877:     mbs  = a->compressedrow.nrows;
878:     ii   = a->compressedrow.i;
879:     ridx = a->compressedrow.rindex;
880:     PetscArrayzero(zarray,12*a->mbs);
881:   } else {
882:     mbs = a->mbs;
883:     ii  = a->i;
884:     z   = zarray;
885:   }

887:   for (i=0; i<mbs; i++) {
888:     n    = ii[i+1] - ii[i];
889:     idx  = ij + ii[i];
890:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
891:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;

893:     for (j=0; j<n; j++) {
894:       xb = x + 12*(idx[j]);

896:       for (k=0; k<12; k++) {
897:         xv     =  xb[k];
898:         sum1  += v[0]*xv;
899:         sum2  += v[1]*xv;
900:         sum3  += v[2]*xv;
901:         sum4  += v[3]*xv;
902:         sum5  += v[4]*xv;
903:         sum6  += v[5]*xv;
904:         sum7  += v[6]*xv;
905:         sum8  += v[7]*xv;
906:         sum9  += v[8]*xv;
907:         sum10 += v[9]*xv;
908:         sum11 += v[10]*xv;
909:         sum12 += v[11]*xv;
910:         v     += 12;
911:       }
912:     }
913:     if (usecprow) z = zarray + 12*ridx[i];
914:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
916:     if (!usecprow) z += 12;
917:   }
918:   VecRestoreArrayRead(xx,&x);
919:   VecRestoreArrayWrite(zz,&zarray);
920:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
921:   return(0);
922: }

924: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
925: {
926:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
927:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
928:   const PetscScalar *x,*xb;
929:   PetscScalar       *zarray,*yarray,xv;
930:   const MatScalar   *v;
931:   PetscErrorCode    ierr;
932:   const PetscInt    *ii,*ij=a->j,*idx;
933:   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
934:   PetscBool         usecprow=a->compressedrow.use;

937:   VecGetArrayRead(xx,&x);
938:   VecGetArrayPair(yy,zz,&yarray,&zarray);

940:   v = a->a;
941:   if (usecprow) {
942:    if (zz != yy) {
943:      PetscArraycpy(zarray,yarray,12*mbs);
944:     }
945:     mbs  = a->compressedrow.nrows;
946:     ii   = a->compressedrow.i;
947:     ridx = a->compressedrow.rindex;
948:   } else {
949:     ii  = a->i;
950:     y   = yarray;
951:     z   = zarray;
952:   }

954:   for (i=0; i<mbs; i++) {
955:     n    = ii[i+1] - ii[i];
956:     idx  = ij + ii[i];

958:     if (usecprow) {
959:       y = yarray + 12*ridx[i];
960:       z = zarray + 12*ridx[i];
961:     }
962:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
963:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

965:     for (j=0; j<n; j++) {
966:       xb = x + 12*(idx[j]);

968:       for (k=0; k<12; k++) {
969:         xv     =  xb[k];
970:         sum1  += v[0]*xv;
971:         sum2  += v[1]*xv;
972:         sum3  += v[2]*xv;
973:         sum4  += v[3]*xv;
974:         sum5  += v[4]*xv;
975:         sum6  += v[5]*xv;
976:         sum7  += v[6]*xv;
977:         sum8  += v[7]*xv;
978:         sum9  += v[8]*xv;
979:         sum10 += v[9]*xv;
980:         sum11 += v[10]*xv;
981:         sum12 += v[11]*xv;
982:         v     += 12;
983:       }
984:     }

986:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
987:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
988:     if (!usecprow) {
989:       y += 12;
990:       z += 12;
991:     }
992:   }
993:   VecRestoreArrayRead(xx,&x);
994:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
995:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
996:   return(0);
997: }

999: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1000: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1001: {
1002:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1003:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1004:   const PetscScalar *x,*xb;
1005:   PetscScalar       x1,x2,x3,x4,*zarray;
1006:   const MatScalar   *v;
1007:   PetscErrorCode    ierr;
1008:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1009:   PetscInt          mbs,i,j,n;
1010:   PetscBool         usecprow=a->compressedrow.use;

1013:   VecGetArrayRead(xx,&x);
1014:   VecGetArrayWrite(zz,&zarray);

1016:   v = a->a;
1017:   if (usecprow) {
1018:     mbs  = a->compressedrow.nrows;
1019:     ii   = a->compressedrow.i;
1020:     ridx = a->compressedrow.rindex;
1021:     PetscArrayzero(zarray,12*a->mbs);
1022:   } else {
1023:     mbs = a->mbs;
1024:     ii  = a->i;
1025:     z   = zarray;
1026:   }

1028:   for (i=0; i<mbs; i++) {
1029:     n    = ii[i+1] - ii[i];
1030:     idx  = ij + ii[i];

1032:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1033:     for (j=0; j<n; j++) {
1034:       xb = x + 12*(idx[j]);
1035:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1037:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1038:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1039:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1040:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1041:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1042:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1043:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1044:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1045:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1046:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1047:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1048:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1049:       v += 48;

1051:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1053:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1054:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1055:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1056:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1057:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1058:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1059:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1060:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1061:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1062:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1063:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1064:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1065:       v     += 48;

1067:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1068:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1069:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1070:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1071:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1072:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1073:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1074:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1075:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1076:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1077:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1078:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1079:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1080:       v     += 48;

1082:     }
1083:     if (usecprow) z = zarray + 12*ridx[i];
1084:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1085:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1086:     if (!usecprow) z += 12;
1087:   }
1088:   VecRestoreArrayRead(xx,&x);
1089:   VecRestoreArrayWrite(zz,&zarray);
1090:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1091:   return(0);
1092: }

1094: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1095: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1096: {
1097:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1098:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1099:   const PetscScalar *x,*xb;
1100:   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1101:   const MatScalar   *v;
1102:   PetscErrorCode    ierr;
1103:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1104:   PetscInt          mbs = a->mbs,i,j,n;
1105:   PetscBool         usecprow=a->compressedrow.use;

1108:   VecGetArrayRead(xx,&x);
1109:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1111:   v = a->a;
1112:   if (usecprow) {
1113:     if (zz != yy) {
1114:       PetscArraycpy(zarray,yarray,12*mbs);
1115:     }
1116:     mbs  = a->compressedrow.nrows;
1117:     ii   = a->compressedrow.i;
1118:     ridx = a->compressedrow.rindex;
1119:   } else {
1120:     ii  = a->i;
1121:     y   = yarray;
1122:     z   = zarray;
1123:   }

1125:   for (i=0; i<mbs; i++) {
1126:     n    = ii[i+1] - ii[i];
1127:     idx  = ij + ii[i];

1129:     if (usecprow) {
1130:       y = yarray + 12*ridx[i];
1131:       z = zarray + 12*ridx[i];
1132:     }
1133:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1134:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

1136:     for (j=0; j<n; j++) {
1137:       xb = x + 12*(idx[j]);
1138:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1140:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1141:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1142:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1143:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1144:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1145:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1146:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1147:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1148:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1149:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1150:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1151:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1152:       v += 48;

1154:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1156:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1157:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1158:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1159:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1160:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1161:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1162:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1163:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1164:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1165:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1166:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1167:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1168:       v     += 48;

1170:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1171:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1172:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1173:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1174:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1175:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1176:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1177:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1178:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1179:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1180:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1181:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1182:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1183:       v     += 48;

1185:     }
1186:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1187:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1188:     if (!usecprow) {
1189:       y += 12;
1190:       z += 12;
1191:     }
1192:   }
1193:   VecRestoreArrayRead(xx,&x);
1194:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1195:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1196:   return(0);
1197: }

1199: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1200: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1201: {
1202:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1203:   PetscScalar       *z = NULL,*zarray;
1204:   const PetscScalar *x,*work;
1205:   const MatScalar   *v = a->a;
1206:   PetscErrorCode    ierr;
1207:   PetscInt          mbs,i,j,n;
1208:   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1209:   PetscBool         usecprow=a->compressedrow.use;
1210:   const PetscInt    bs = 12, bs2 = 144;

1212:   __m256d a0,a1,a2,a3,a4,a5;
1213:   __m256d w0,w1,w2,w3;
1214:   __m256d z0,z1,z2;

1217:   VecGetArrayRead(xx,&x);
1218:   VecGetArrayWrite(zz,&zarray);

1220:   if (usecprow) {
1221:     mbs  = a->compressedrow.nrows;
1222:     ii   = a->compressedrow.i;
1223:     ridx = a->compressedrow.rindex;
1224:     PetscArrayzero(zarray,bs*a->mbs);
1225:   } else {
1226:     mbs = a->mbs;
1227:     ii  = a->i;
1228:     z   = zarray;
1229:   }

1231:   for (i=0; i<mbs; i++) {
1232:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

1234:     n  = ii[1] - ii[0]; ii++;
1235:     for (j=0; j<n; j++) {
1236:       work = x + bs*(*idx++);

1238:       /* first column of a */
1239:       w0 = _mm256_set1_pd(work[0]);
1240:       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1241:       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1242:       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);

1244:       /* second column of a */
1245:       w1 = _mm256_set1_pd(work[1]);
1246:       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1247:       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1248:       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);

1250:       /* third column of a */
1251:       w2 = _mm256_set1_pd(work[2]);
1252:       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1253:       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1254:       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);

1256:       /* fourth column of a */
1257:       w3 = _mm256_set1_pd(work[3]);
1258:       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1259:       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1260:       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);

1262:       /* fifth column of a */
1263:       w0 = _mm256_set1_pd(work[4]);
1264:       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1265:       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1266:       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);

1268:       /* sixth column of a */
1269:       w1 = _mm256_set1_pd(work[5]);
1270:       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1271:       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1272:       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);

1274:       /* seventh column of a */
1275:       w2 = _mm256_set1_pd(work[6]);
1276:       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1277:       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1278:       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);

1280:       /* eigth column of a */
1281:       w3 = _mm256_set1_pd(work[7]);
1282:       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1283:       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1284:       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);

1286:       /* ninth column of a */
1287:       w0 = _mm256_set1_pd(work[8]);
1288:       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1289:       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1290:       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);

1292:       /* tenth column of a */
1293:       w1 = _mm256_set1_pd(work[9]);
1294:       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1295:       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1296:       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);

1298:       /* eleventh column of a */
1299:       w2 = _mm256_set1_pd(work[10]);
1300:       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1301:       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1302:       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);

1304:       /* twelveth column of a */
1305:       w3 = _mm256_set1_pd(work[11]);
1306:       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1307:       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1308:       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);

1310:       v += bs2;
1311:     }
1312:     if (usecprow) z = zarray + bs*ridx[i];
1313:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1314:     if (!usecprow) z += bs;
1315:   }
1316:   VecRestoreArrayRead(xx,&x);
1317:   VecRestoreArrayWrite(zz,&zarray);
1318:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1319:   return(0);
1320: }
1321: #endif

1323: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1324: /* Default MatMult for block size 15 */
1325: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1326: {
1327:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1328:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1329:   const PetscScalar *x,*xb;
1330:   PetscScalar       *zarray,xv;
1331:   const MatScalar   *v;
1332:   PetscErrorCode    ierr;
1333:   const PetscInt    *ii,*ij=a->j,*idx;
1334:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1335:   PetscBool         usecprow=a->compressedrow.use;

1338:   VecGetArrayRead(xx,&x);
1339:   VecGetArrayWrite(zz,&zarray);

1341:   v = a->a;
1342:   if (usecprow) {
1343:     mbs  = a->compressedrow.nrows;
1344:     ii   = a->compressedrow.i;
1345:     ridx = a->compressedrow.rindex;
1346:     PetscArrayzero(zarray,15*a->mbs);
1347:   } else {
1348:     mbs = a->mbs;
1349:     ii  = a->i;
1350:     z   = zarray;
1351:   }

1353:   for (i=0; i<mbs; i++) {
1354:     n    = ii[i+1] - ii[i];
1355:     idx  = ij + ii[i];
1356:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1357:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1359:     for (j=0; j<n; j++) {
1360:       xb = x + 15*(idx[j]);

1362:       for (k=0; k<15; k++) {
1363:         xv     =  xb[k];
1364:         sum1  += v[0]*xv;
1365:         sum2  += v[1]*xv;
1366:         sum3  += v[2]*xv;
1367:         sum4  += v[3]*xv;
1368:         sum5  += v[4]*xv;
1369:         sum6  += v[5]*xv;
1370:         sum7  += v[6]*xv;
1371:         sum8  += v[7]*xv;
1372:         sum9  += v[8]*xv;
1373:         sum10 += v[9]*xv;
1374:         sum11 += v[10]*xv;
1375:         sum12 += v[11]*xv;
1376:         sum13 += v[12]*xv;
1377:         sum14 += v[13]*xv;
1378:         sum15 += v[14]*xv;
1379:         v     += 15;
1380:       }
1381:     }
1382:     if (usecprow) z = zarray + 15*ridx[i];
1383:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1384:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1386:     if (!usecprow) z += 15;
1387:   }

1389:   VecRestoreArrayRead(xx,&x);
1390:   VecRestoreArrayWrite(zz,&zarray);
1391:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1392:   return(0);
1393: }

1395: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1396: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1397: {
1398:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1399:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1400:   const PetscScalar *x,*xb;
1401:   PetscScalar       x1,x2,x3,x4,*zarray;
1402:   const MatScalar   *v;
1403:   PetscErrorCode    ierr;
1404:   const PetscInt    *ii,*ij=a->j,*idx;
1405:   PetscInt          mbs,i,j,n,*ridx=NULL;
1406:   PetscBool         usecprow=a->compressedrow.use;

1409:   VecGetArrayRead(xx,&x);
1410:   VecGetArrayWrite(zz,&zarray);

1412:   v = a->a;
1413:   if (usecprow) {
1414:     mbs  = a->compressedrow.nrows;
1415:     ii   = a->compressedrow.i;
1416:     ridx = a->compressedrow.rindex;
1417:     PetscArrayzero(zarray,15*a->mbs);
1418:   } else {
1419:     mbs = a->mbs;
1420:     ii  = a->i;
1421:     z   = zarray;
1422:   }

1424:   for (i=0; i<mbs; i++) {
1425:     n    = ii[i+1] - ii[i];
1426:     idx  = ij + ii[i];
1427:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1428:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1430:     for (j=0; j<n; j++) {
1431:       xb = x + 15*(idx[j]);
1432:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1434:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1435:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1436:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1437:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1438:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1439:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1440:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1441:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1442:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1443:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1444:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1445:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1446:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1447:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1448:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

1450:       v += 60;

1452:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1454:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1455:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1456:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1457:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1458:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1459:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1460:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1461:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1462:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1463:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1464:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1465:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1466:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1467:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1468:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1469:       v     += 60;

1471:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1472:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1473:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1474:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1475:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1476:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1477:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1478:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1479:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1480:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1481:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1482:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1483:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1484:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1485:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1486:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1487:       v     += 60;

1489:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1490:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1491:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1492:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1493:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1494:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1495:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1496:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1497:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1498:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1499:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1500:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1501:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1502:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1503:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1504:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1505:       v     += 45;
1506:     }
1507:     if (usecprow) z = zarray + 15*ridx[i];
1508:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1509:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1511:     if (!usecprow) z += 15;
1512:   }

1514:   VecRestoreArrayRead(xx,&x);
1515:   VecRestoreArrayWrite(zz,&zarray);
1516:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1517:   return(0);
1518: }

1520: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1521: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1522: {
1523:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1524:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1525:   const PetscScalar *x,*xb;
1526:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1527:   const MatScalar   *v;
1528:   PetscErrorCode    ierr;
1529:   const PetscInt    *ii,*ij=a->j,*idx;
1530:   PetscInt          mbs,i,j,n,*ridx=NULL;
1531:   PetscBool         usecprow=a->compressedrow.use;

1534:   VecGetArrayRead(xx,&x);
1535:   VecGetArrayWrite(zz,&zarray);

1537:   v = a->a;
1538:   if (usecprow) {
1539:     mbs  = a->compressedrow.nrows;
1540:     ii   = a->compressedrow.i;
1541:     ridx = a->compressedrow.rindex;
1542:     PetscArrayzero(zarray,15*a->mbs);
1543:   } else {
1544:     mbs = a->mbs;
1545:     ii  = a->i;
1546:     z   = zarray;
1547:   }

1549:   for (i=0; i<mbs; i++) {
1550:     n    = ii[i+1] - ii[i];
1551:     idx  = ij + ii[i];
1552:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1553:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1555:     for (j=0; j<n; j++) {
1556:       xb = x + 15*(idx[j]);
1557:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1558:       x8 = xb[7];

1560:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1561:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1562:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1563:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1564:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1565:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1566:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1567:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1568:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1569:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1570:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1571:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1572:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1573:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1574:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1575:       v     += 120;

1577:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1579:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1580:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1581:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1582:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1583:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1584:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1585:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1586:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1587:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1588:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1589:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1590:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1591:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1592:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1593:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1594:       v     += 105;
1595:     }
1596:     if (usecprow) z = zarray + 15*ridx[i];
1597:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1598:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1600:     if (!usecprow) z += 15;
1601:   }

1603:   VecRestoreArrayRead(xx,&x);
1604:   VecRestoreArrayWrite(zz,&zarray);
1605:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1606:   return(0);
1607: }

1609: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1610: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1611: {
1612:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1613:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1614:   const PetscScalar *x,*xb;
1615:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1616:   const MatScalar   *v;
1617:   PetscErrorCode    ierr;
1618:   const PetscInt    *ii,*ij=a->j,*idx;
1619:   PetscInt          mbs,i,j,n,*ridx=NULL;
1620:   PetscBool         usecprow=a->compressedrow.use;

1623:   VecGetArrayRead(xx,&x);
1624:   VecGetArrayWrite(zz,&zarray);

1626:   v = a->a;
1627:   if (usecprow) {
1628:     mbs  = a->compressedrow.nrows;
1629:     ii   = a->compressedrow.i;
1630:     ridx = a->compressedrow.rindex;
1631:     PetscArrayzero(zarray,15*a->mbs);
1632:   } else {
1633:     mbs = a->mbs;
1634:     ii  = a->i;
1635:     z   = zarray;
1636:   }

1638:   for (i=0; i<mbs; i++) {
1639:     n    = ii[i+1] - ii[i];
1640:     idx  = ij + ii[i];
1641:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1642:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1644:     for (j=0; j<n; j++) {
1645:       xb = x + 15*(idx[j]);
1646:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1647:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1649:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1650:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1651:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1652:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1653:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1654:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1655:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1656:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1657:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1658:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1659:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1660:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1661:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1662:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1663:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1664:       v     += 225;
1665:     }
1666:     if (usecprow) z = zarray + 15*ridx[i];
1667:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1668:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1670:     if (!usecprow) z += 15;
1671:   }

1673:   VecRestoreArrayRead(xx,&x);
1674:   VecRestoreArrayWrite(zz,&zarray);
1675:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1676:   return(0);
1677: }

1679: /*
1680:     This will not work with MatScalar == float because it calls the BLAS
1681: */
1682: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1683: {
1684:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1685:   PetscScalar       *z = NULL,*work,*workt,*zarray;
1686:   const PetscScalar *x,*xb;
1687:   const MatScalar   *v;
1688:   PetscErrorCode    ierr;
1689:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1690:   const PetscInt    *idx,*ii,*ridx=NULL;
1691:   PetscInt          ncols,k;
1692:   PetscBool         usecprow=a->compressedrow.use;

1695:   VecGetArrayRead(xx,&x);
1696:   VecGetArrayWrite(zz,&zarray);

1698:   idx = a->j;
1699:   v   = a->a;
1700:   if (usecprow) {
1701:     mbs  = a->compressedrow.nrows;
1702:     ii   = a->compressedrow.i;
1703:     ridx = a->compressedrow.rindex;
1704:     PetscArrayzero(zarray,bs*a->mbs);
1705:   } else {
1706:     mbs = a->mbs;
1707:     ii  = a->i;
1708:     z   = zarray;
1709:   }

1711:   if (!a->mult_work) {
1712:     k    = PetscMax(A->rmap->n,A->cmap->n);
1713:     PetscMalloc1(k+1,&a->mult_work);
1714:   }
1715:   work = a->mult_work;
1716:   for (i=0; i<mbs; i++) {
1717:     n           = ii[1] - ii[0]; ii++;
1718:     ncols       = n*bs;
1719:     workt       = work;
1720:     for (j=0; j<n; j++) {
1721:       xb = x + bs*(*idx++);
1722:       for (k=0; k<bs; k++) workt[k] = xb[k];
1723:       workt += bs;
1724:     }
1725:     if (usecprow) z = zarray + bs*ridx[i];
1726:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1727:     v += n*bs2;
1728:     if (!usecprow) z += bs;
1729:   }
1730:   VecRestoreArrayRead(xx,&x);
1731:   VecRestoreArrayWrite(zz,&zarray);
1732:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1733:   return(0);
1734: }

1736: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1737: {
1738:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1739:   const PetscScalar *x;
1740:   PetscScalar       *y,*z,sum;
1741:   const MatScalar   *v;
1742:   PetscErrorCode    ierr;
1743:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1744:   const PetscInt    *idx,*ii;
1745:   PetscBool         usecprow=a->compressedrow.use;

1748:   VecGetArrayRead(xx,&x);
1749:   VecGetArrayPair(yy,zz,&y,&z);

1751:   idx = a->j;
1752:   v   = a->a;
1753:   if (usecprow) {
1754:     if (zz != yy) {
1755:       PetscArraycpy(z,y,mbs);
1756:     }
1757:     mbs  = a->compressedrow.nrows;
1758:     ii   = a->compressedrow.i;
1759:     ridx = a->compressedrow.rindex;
1760:   } else {
1761:     ii = a->i;
1762:   }

1764:   for (i=0; i<mbs; i++) {
1765:     n = ii[1] - ii[0];
1766:     ii++;
1767:     if (!usecprow) {
1768:       sum = y[i];
1769:     } else {
1770:       sum = y[ridx[i]];
1771:     }
1772:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1773:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1774:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1775:     v   += n;
1776:     idx += n;
1777:     if (usecprow) {
1778:       z[ridx[i]] = sum;
1779:     } else {
1780:       z[i] = sum;
1781:     }
1782:   }
1783:   VecRestoreArrayRead(xx,&x);
1784:   VecRestoreArrayPair(yy,zz,&y,&z);
1785:   PetscLogFlops(2.0*a->nz);
1786:   return(0);
1787: }

1789: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1790: {
1791:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1792:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1793:   const PetscScalar *x,*xb;
1794:   PetscScalar       x1,x2,*yarray,*zarray;
1795:   const MatScalar   *v;
1796:   PetscErrorCode    ierr;
1797:   PetscInt          mbs = a->mbs,i,n,j;
1798:   const PetscInt    *idx,*ii,*ridx = NULL;
1799:   PetscBool         usecprow = a->compressedrow.use;

1802:   VecGetArrayRead(xx,&x);
1803:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1805:   idx = a->j;
1806:   v   = a->a;
1807:   if (usecprow) {
1808:     if (zz != yy) {
1809:       PetscArraycpy(zarray,yarray,2*mbs);
1810:     }
1811:     mbs  = a->compressedrow.nrows;
1812:     ii   = a->compressedrow.i;
1813:     ridx = a->compressedrow.rindex;
1814:   } else {
1815:     ii = a->i;
1816:     y  = yarray;
1817:     z  = zarray;
1818:   }

1820:   for (i=0; i<mbs; i++) {
1821:     n = ii[1] - ii[0]; ii++;
1822:     if (usecprow) {
1823:       z = zarray + 2*ridx[i];
1824:       y = yarray + 2*ridx[i];
1825:     }
1826:     sum1 = y[0]; sum2 = y[1];
1827:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1828:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1829:     for (j=0; j<n; j++) {
1830:       xb = x + 2*(*idx++);
1831:       x1 = xb[0];
1832:       x2 = xb[1];

1834:       sum1 += v[0]*x1 + v[2]*x2;
1835:       sum2 += v[1]*x1 + v[3]*x2;
1836:       v    += 4;
1837:     }
1838:     z[0] = sum1; z[1] = sum2;
1839:     if (!usecprow) {
1840:       z += 2; y += 2;
1841:     }
1842:   }
1843:   VecRestoreArrayRead(xx,&x);
1844:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1845:   PetscLogFlops(4.0*a->nz);
1846:   return(0);
1847: }

1849: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1850: {
1851:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1852:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1853:   const PetscScalar *x,*xb;
1854:   const MatScalar   *v;
1855:   PetscErrorCode    ierr;
1856:   PetscInt          mbs = a->mbs,i,j,n;
1857:   const PetscInt    *idx,*ii,*ridx = NULL;
1858:   PetscBool         usecprow = a->compressedrow.use;

1861:   VecGetArrayRead(xx,&x);
1862:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1864:   idx = a->j;
1865:   v   = a->a;
1866:   if (usecprow) {
1867:     if (zz != yy) {
1868:       PetscArraycpy(zarray,yarray,3*mbs);
1869:     }
1870:     mbs  = a->compressedrow.nrows;
1871:     ii   = a->compressedrow.i;
1872:     ridx = a->compressedrow.rindex;
1873:   } else {
1874:     ii = a->i;
1875:     y  = yarray;
1876:     z  = zarray;
1877:   }

1879:   for (i=0; i<mbs; i++) {
1880:     n = ii[1] - ii[0]; ii++;
1881:     if (usecprow) {
1882:       z = zarray + 3*ridx[i];
1883:       y = yarray + 3*ridx[i];
1884:     }
1885:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1886:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1887:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1888:     for (j=0; j<n; j++) {
1889:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1890:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1891:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1892:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1893:       v    += 9;
1894:     }
1895:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1896:     if (!usecprow) {
1897:       z += 3; y += 3;
1898:     }
1899:   }
1900:   VecRestoreArrayRead(xx,&x);
1901:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1902:   PetscLogFlops(18.0*a->nz);
1903:   return(0);
1904: }

1906: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1907: {
1908:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1909:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1910:   const PetscScalar *x,*xb;
1911:   const MatScalar   *v;
1912:   PetscErrorCode    ierr;
1913:   PetscInt          mbs = a->mbs,i,j,n;
1914:   const PetscInt    *idx,*ii,*ridx=NULL;
1915:   PetscBool         usecprow=a->compressedrow.use;

1918:   VecGetArrayRead(xx,&x);
1919:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1921:   idx = a->j;
1922:   v   = a->a;
1923:   if (usecprow) {
1924:     if (zz != yy) {
1925:       PetscArraycpy(zarray,yarray,4*mbs);
1926:     }
1927:     mbs  = a->compressedrow.nrows;
1928:     ii   = a->compressedrow.i;
1929:     ridx = a->compressedrow.rindex;
1930:   } else {
1931:     ii = a->i;
1932:     y  = yarray;
1933:     z  = zarray;
1934:   }

1936:   for (i=0; i<mbs; i++) {
1937:     n = ii[1] - ii[0]; ii++;
1938:     if (usecprow) {
1939:       z = zarray + 4*ridx[i];
1940:       y = yarray + 4*ridx[i];
1941:     }
1942:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1943:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1944:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1945:     for (j=0; j<n; j++) {
1946:       xb    = x + 4*(*idx++);
1947:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1948:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1949:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1950:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1951:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1952:       v    += 16;
1953:     }
1954:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1955:     if (!usecprow) {
1956:       z += 4; y += 4;
1957:     }
1958:   }
1959:   VecRestoreArrayRead(xx,&x);
1960:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1961:   PetscLogFlops(32.0*a->nz);
1962:   return(0);
1963: }

1965: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1966: {
1967:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1968:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1969:   const PetscScalar *x,*xb;
1970:   PetscScalar       *yarray,*zarray;
1971:   const MatScalar   *v;
1972:   PetscErrorCode    ierr;
1973:   PetscInt          mbs = a->mbs,i,j,n;
1974:   const PetscInt    *idx,*ii,*ridx = NULL;
1975:   PetscBool         usecprow=a->compressedrow.use;

1978:   VecGetArrayRead(xx,&x);
1979:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1981:   idx = a->j;
1982:   v   = a->a;
1983:   if (usecprow) {
1984:     if (zz != yy) {
1985:       PetscArraycpy(zarray,yarray,5*mbs);
1986:     }
1987:     mbs  = a->compressedrow.nrows;
1988:     ii   = a->compressedrow.i;
1989:     ridx = a->compressedrow.rindex;
1990:   } else {
1991:     ii = a->i;
1992:     y  = yarray;
1993:     z  = zarray;
1994:   }

1996:   for (i=0; i<mbs; i++) {
1997:     n = ii[1] - ii[0]; ii++;
1998:     if (usecprow) {
1999:       z = zarray + 5*ridx[i];
2000:       y = yarray + 5*ridx[i];
2001:     }
2002:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2003:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2004:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2005:     for (j=0; j<n; j++) {
2006:       xb    = x + 5*(*idx++);
2007:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2008:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2009:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2010:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2011:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2012:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2013:       v    += 25;
2014:     }
2015:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2016:     if (!usecprow) {
2017:       z += 5; y += 5;
2018:     }
2019:   }
2020:   VecRestoreArrayRead(xx,&x);
2021:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2022:   PetscLogFlops(50.0*a->nz);
2023:   return(0);
2024: }

2026: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2027: {
2028:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2029:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2030:   const PetscScalar *x,*xb;
2031:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2032:   const MatScalar   *v;
2033:   PetscErrorCode    ierr;
2034:   PetscInt          mbs = a->mbs,i,j,n;
2035:   const PetscInt    *idx,*ii,*ridx=NULL;
2036:   PetscBool         usecprow=a->compressedrow.use;

2039:   VecGetArrayRead(xx,&x);
2040:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2042:   idx = a->j;
2043:   v   = a->a;
2044:   if (usecprow) {
2045:     if (zz != yy) {
2046:       PetscArraycpy(zarray,yarray,6*mbs);
2047:     }
2048:     mbs  = a->compressedrow.nrows;
2049:     ii   = a->compressedrow.i;
2050:     ridx = a->compressedrow.rindex;
2051:   } else {
2052:     ii = a->i;
2053:     y  = yarray;
2054:     z  = zarray;
2055:   }

2057:   for (i=0; i<mbs; i++) {
2058:     n = ii[1] - ii[0]; ii++;
2059:     if (usecprow) {
2060:       z = zarray + 6*ridx[i];
2061:       y = yarray + 6*ridx[i];
2062:     }
2063:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2064:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2065:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2066:     for (j=0; j<n; j++) {
2067:       xb    = x + 6*(*idx++);
2068:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2069:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2070:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2071:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2072:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2073:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2074:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2075:       v    += 36;
2076:     }
2077:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2078:     if (!usecprow) {
2079:       z += 6; y += 6;
2080:     }
2081:   }
2082:   VecRestoreArrayRead(xx,&x);
2083:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2084:   PetscLogFlops(72.0*a->nz);
2085:   return(0);
2086: }

2088: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2089: {
2090:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2091:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2092:   const PetscScalar *x,*xb;
2093:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2094:   const MatScalar   *v;
2095:   PetscErrorCode    ierr;
2096:   PetscInt          mbs = a->mbs,i,j,n;
2097:   const PetscInt    *idx,*ii,*ridx = NULL;
2098:   PetscBool         usecprow=a->compressedrow.use;

2101:   VecGetArrayRead(xx,&x);
2102:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2104:   idx = a->j;
2105:   v   = a->a;
2106:   if (usecprow) {
2107:     if (zz != yy) {
2108:       PetscArraycpy(zarray,yarray,7*mbs);
2109:     }
2110:     mbs  = a->compressedrow.nrows;
2111:     ii   = a->compressedrow.i;
2112:     ridx = a->compressedrow.rindex;
2113:   } else {
2114:     ii = a->i;
2115:     y  = yarray;
2116:     z  = zarray;
2117:   }

2119:   for (i=0; i<mbs; i++) {
2120:     n = ii[1] - ii[0]; ii++;
2121:     if (usecprow) {
2122:       z = zarray + 7*ridx[i];
2123:       y = yarray + 7*ridx[i];
2124:     }
2125:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2126:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2127:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2128:     for (j=0; j<n; j++) {
2129:       xb    = x + 7*(*idx++);
2130:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2131:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2132:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2133:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2134:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2135:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2136:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2137:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2138:       v    += 49;
2139:     }
2140:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2141:     if (!usecprow) {
2142:       z += 7; y += 7;
2143:     }
2144:   }
2145:   VecRestoreArrayRead(xx,&x);
2146:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2147:   PetscLogFlops(98.0*a->nz);
2148:   return(0);
2149: }

2151: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2152: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2153: {
2154:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2155:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2156:   const PetscScalar *x,*xb;
2157:   const MatScalar   *v;
2158:   PetscErrorCode    ierr;
2159:   PetscInt          mbs,i,j,n;
2160:   PetscInt          k;
2161:   PetscBool         usecprow=a->compressedrow.use;
2162:   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;

2164:   __m256d a0,a1,a2,a3,a4,a5;
2165:   __m256d w0,w1,w2,w3;
2166:   __m256d z0,z1,z2;
2167:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

2170:   VecCopy(yy,zz);
2171:   VecGetArrayRead(xx,&x);
2172:   VecGetArray(zz,&zarray);

2174:   idx = a->j;
2175:   v   = a->a;
2176:   if (usecprow) {
2177:     mbs  = a->compressedrow.nrows;
2178:     ii   = a->compressedrow.i;
2179:     ridx = a->compressedrow.rindex;
2180:   } else {
2181:     mbs = a->mbs;
2182:     ii  = a->i;
2183:     z   = zarray;
2184:   }

2186:   if (!a->mult_work) {
2187:     k    = PetscMax(A->rmap->n,A->cmap->n);
2188:     PetscMalloc1(k+1,&a->mult_work);
2189:   }

2191:   work = a->mult_work;
2192:   for (i=0; i<mbs; i++) {
2193:     n           = ii[1] - ii[0]; ii++;
2194:     workt       = work;
2195:     for (j=0; j<n; j++) {
2196:       xb = x + bs*(*idx++);
2197:       for (k=0; k<bs; k++) workt[k] = xb[k];
2198:       workt += bs;
2199:     }
2200:     if (usecprow) z = zarray + bs*ridx[i];

2202:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

2204:     for (j=0; j<n; j++) {
2205:       /* first column of a */
2206:       w0 = _mm256_set1_pd(work[j*9  ]);
2207:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2208:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2209:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

2211:       /* second column of a */
2212:       w1 = _mm256_set1_pd(work[j*9+ 1]);
2213:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2214:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2215:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2217:       /* third column of a */
2218:       w2 = _mm256_set1_pd(work[j*9+ 2]);
2219:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2220:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2221:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

2223:       /* fourth column of a */
2224:       w3 = _mm256_set1_pd(work[j*9+ 3]);
2225:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2226:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2227:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

2229:       /* fifth column of a */
2230:       w0 = _mm256_set1_pd(work[j*9+ 4]);
2231:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2232:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2233:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

2235:       /* sixth column of a */
2236:       w1 = _mm256_set1_pd(work[j*9+ 5]);
2237:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2238:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2239:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2241:       /* seventh column of a */
2242:       w2 = _mm256_set1_pd(work[j*9+ 6]);
2243:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2244:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2245:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

2247:       /* eigth column of a */
2248:       w3 = _mm256_set1_pd(work[j*9+ 7]);
2249:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2250:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2251:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

2253:       /* ninth column of a */
2254:       w0 = _mm256_set1_pd(work[j*9+ 8]);
2255:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2256:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2257:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2258:     }

2260:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

2262:     v += n*bs2;
2263:     if (!usecprow) z += bs;
2264:   }
2265:   VecRestoreArrayRead(xx,&x);
2266:   VecRestoreArray(zz,&zarray);
2267:   PetscLogFlops(162.0*a->nz);
2268:   return(0);
2269: }
2270: #endif

2272: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2273: {
2274:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2275:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2276:   const PetscScalar *x,*xb;
2277:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2278:   const MatScalar   *v;
2279:   PetscErrorCode    ierr;
2280:   PetscInt          mbs = a->mbs,i,j,n;
2281:   const PetscInt    *idx,*ii,*ridx = NULL;
2282:   PetscBool         usecprow=a->compressedrow.use;

2285:   VecGetArrayRead(xx,&x);
2286:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2288:   idx = a->j;
2289:   v   = a->a;
2290:   if (usecprow) {
2291:     if (zz != yy) {
2292:       PetscArraycpy(zarray,yarray,7*mbs);
2293:     }
2294:     mbs  = a->compressedrow.nrows;
2295:     ii   = a->compressedrow.i;
2296:     ridx = a->compressedrow.rindex;
2297:   } else {
2298:     ii = a->i;
2299:     y  = yarray;
2300:     z  = zarray;
2301:   }

2303:   for (i=0; i<mbs; i++) {
2304:     n = ii[1] - ii[0]; ii++;
2305:     if (usecprow) {
2306:       z = zarray + 11*ridx[i];
2307:       y = yarray + 11*ridx[i];
2308:     }
2309:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2310:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2311:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2312:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2313:     for (j=0; j<n; j++) {
2314:       xb    = x + 11*(*idx++);
2315:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2316:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2317:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2318:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2319:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2320:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2321:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2322:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2323:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2324:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2325:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2326:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2327:       v    += 121;
2328:     }
2329:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2330:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2331:     if (!usecprow) {
2332:       z += 11; y += 11;
2333:     }
2334:   }
2335:   VecRestoreArrayRead(xx,&x);
2336:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2337:   PetscLogFlops(242.0*a->nz);
2338:   return(0);
2339: }

2341: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2342: {
2343:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2344:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2345:   const PetscScalar *x,*xb;
2346:   const MatScalar   *v;
2347:   PetscErrorCode    ierr;
2348:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2349:   PetscInt          ncols,k;
2350:   const PetscInt    *ridx = NULL,*idx,*ii;
2351:   PetscBool         usecprow = a->compressedrow.use;

2354:   VecCopy(yy,zz);
2355:   VecGetArrayRead(xx,&x);
2356:   VecGetArray(zz,&zarray);

2358:   idx = a->j;
2359:   v   = a->a;
2360:   if (usecprow) {
2361:     mbs  = a->compressedrow.nrows;
2362:     ii   = a->compressedrow.i;
2363:     ridx = a->compressedrow.rindex;
2364:   } else {
2365:     mbs = a->mbs;
2366:     ii  = a->i;
2367:     z   = zarray;
2368:   }

2370:   if (!a->mult_work) {
2371:     k    = PetscMax(A->rmap->n,A->cmap->n);
2372:     PetscMalloc1(k+1,&a->mult_work);
2373:   }
2374:   work = a->mult_work;
2375:   for (i=0; i<mbs; i++) {
2376:     n     = ii[1] - ii[0]; ii++;
2377:     ncols = n*bs;
2378:     workt = work;
2379:     for (j=0; j<n; j++) {
2380:       xb = x + bs*(*idx++);
2381:       for (k=0; k<bs; k++) workt[k] = xb[k];
2382:       workt += bs;
2383:     }
2384:     if (usecprow) z = zarray + bs*ridx[i];
2385:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2386:     v += n*bs2;
2387:     if (!usecprow) z += bs;
2388:   }
2389:   VecRestoreArrayRead(xx,&x);
2390:   VecRestoreArray(zz,&zarray);
2391:   PetscLogFlops(2.0*a->nz*bs2);
2392:   return(0);
2393: }

2395: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2396: {
2397:   PetscScalar    zero = 0.0;

2401:   VecSet(zz,zero);
2402:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2403:   return(0);
2404: }

2406: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2407: {
2408:   PetscScalar    zero = 0.0;

2412:   VecSet(zz,zero);
2413:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2414:   return(0);
2415: }

2417: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2418: {
2419:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2420:   PetscScalar       *z,x1,x2,x3,x4,x5;
2421:   const PetscScalar *x,*xb = NULL;
2422:   const MatScalar   *v;
2423:   PetscErrorCode    ierr;
2424:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2425:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2426:   Mat_CompressedRow cprow = a->compressedrow;
2427:   PetscBool         usecprow = cprow.use;

2430:   if (yy != zz) { VecCopy(yy,zz); }
2431:   VecGetArrayRead(xx,&x);
2432:   VecGetArray(zz,&z);

2434:   idx = a->j;
2435:   v   = a->a;
2436:   if (usecprow) {
2437:     mbs  = cprow.nrows;
2438:     ii   = cprow.i;
2439:     ridx = cprow.rindex;
2440:   } else {
2441:     mbs=a->mbs;
2442:     ii = a->i;
2443:     xb = x;
2444:   }

2446:   switch (bs) {
2447:   case 1:
2448:     for (i=0; i<mbs; i++) {
2449:       if (usecprow) xb = x + ridx[i];
2450:       x1 = xb[0];
2451:       ib = idx + ii[0];
2452:       n  = ii[1] - ii[0]; ii++;
2453:       for (j=0; j<n; j++) {
2454:         rval     = ib[j];
2455:         z[rval] += PetscConj(*v) * x1;
2456:         v++;
2457:       }
2458:       if (!usecprow) xb++;
2459:     }
2460:     break;
2461:   case 2:
2462:     for (i=0; i<mbs; i++) {
2463:       if (usecprow) xb = x + 2*ridx[i];
2464:       x1 = xb[0]; x2 = xb[1];
2465:       ib = idx + ii[0];
2466:       n  = ii[1] - ii[0]; ii++;
2467:       for (j=0; j<n; j++) {
2468:         rval       = ib[j]*2;
2469:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2470:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2471:         v         += 4;
2472:       }
2473:       if (!usecprow) xb += 2;
2474:     }
2475:     break;
2476:   case 3:
2477:     for (i=0; i<mbs; i++) {
2478:       if (usecprow) xb = x + 3*ridx[i];
2479:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2480:       ib = idx + ii[0];
2481:       n  = ii[1] - ii[0]; ii++;
2482:       for (j=0; j<n; j++) {
2483:         rval       = ib[j]*3;
2484:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2485:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2486:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2487:         v         += 9;
2488:       }
2489:       if (!usecprow) xb += 3;
2490:     }
2491:     break;
2492:   case 4:
2493:     for (i=0; i<mbs; i++) {
2494:       if (usecprow) xb = x + 4*ridx[i];
2495:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2496:       ib = idx + ii[0];
2497:       n  = ii[1] - ii[0]; ii++;
2498:       for (j=0; j<n; j++) {
2499:         rval       = ib[j]*4;
2500:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2501:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2502:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2503:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2504:         v         += 16;
2505:       }
2506:       if (!usecprow) xb += 4;
2507:     }
2508:     break;
2509:   case 5:
2510:     for (i=0; i<mbs; i++) {
2511:       if (usecprow) xb = x + 5*ridx[i];
2512:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2513:       x4 = xb[3]; x5 = xb[4];
2514:       ib = idx + ii[0];
2515:       n  = ii[1] - ii[0]; ii++;
2516:       for (j=0; j<n; j++) {
2517:         rval       = ib[j]*5;
2518:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2519:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2520:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2521:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2522:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2523:         v         += 25;
2524:       }
2525:       if (!usecprow) xb += 5;
2526:     }
2527:     break;
2528:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2529:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2530: #if 0
2531:     {
2532:       PetscInt          ncols,k,bs2=a->bs2;
2533:       PetscScalar       *work,*workt,zb;
2534:       const PetscScalar *xtmp;
2535:       if (!a->mult_work) {
2536:         k    = PetscMax(A->rmap->n,A->cmap->n);
2537:         PetscMalloc1(k+1,&a->mult_work);
2538:       }
2539:       work = a->mult_work;
2540:       xtmp = x;
2541:       for (i=0; i<mbs; i++) {
2542:         n     = ii[1] - ii[0]; ii++;
2543:         ncols = n*bs;
2544:         PetscArrayzero(work,ncols);
2545:         if (usecprow) xtmp = x + bs*ridx[i];
2546:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2547:         v += n*bs2;
2548:         if (!usecprow) xtmp += bs;
2549:         workt = work;
2550:         for (j=0; j<n; j++) {
2551:           zb = z + bs*(*idx++);
2552:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2553:           workt += bs;
2554:         }
2555:       }
2556:     }
2557: #endif
2558:   }
2559:   VecRestoreArrayRead(xx,&x);
2560:   VecRestoreArray(zz,&z);
2561:   PetscLogFlops(2.0*a->nz*a->bs2);
2562:   return(0);
2563: }

2565: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2566: {
2567:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2568:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2569:   const PetscScalar *x,*xb = NULL;
2570:   const MatScalar   *v;
2571:   PetscErrorCode    ierr;
2572:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2573:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2574:   Mat_CompressedRow cprow   = a->compressedrow;
2575:   PetscBool         usecprow=cprow.use;

2578:   if (yy != zz) { VecCopy(yy,zz); }
2579:   VecGetArrayRead(xx,&x);
2580:   VecGetArray(zz,&z);

2582:   idx = a->j;
2583:   v   = a->a;
2584:   if (usecprow) {
2585:     mbs  = cprow.nrows;
2586:     ii   = cprow.i;
2587:     ridx = cprow.rindex;
2588:   } else {
2589:     mbs=a->mbs;
2590:     ii = a->i;
2591:     xb = x;
2592:   }

2594:   switch (bs) {
2595:   case 1:
2596:     for (i=0; i<mbs; i++) {
2597:       if (usecprow) xb = x + ridx[i];
2598:       x1 = xb[0];
2599:       ib = idx + ii[0];
2600:       n  = ii[1] - ii[0]; ii++;
2601:       for (j=0; j<n; j++) {
2602:         rval     = ib[j];
2603:         z[rval] += *v * x1;
2604:         v++;
2605:       }
2606:       if (!usecprow) xb++;
2607:     }
2608:     break;
2609:   case 2:
2610:     for (i=0; i<mbs; i++) {
2611:       if (usecprow) xb = x + 2*ridx[i];
2612:       x1 = xb[0]; x2 = xb[1];
2613:       ib = idx + ii[0];
2614:       n  = ii[1] - ii[0]; ii++;
2615:       for (j=0; j<n; j++) {
2616:         rval       = ib[j]*2;
2617:         z[rval++] += v[0]*x1 + v[1]*x2;
2618:         z[rval++] += v[2]*x1 + v[3]*x2;
2619:         v         += 4;
2620:       }
2621:       if (!usecprow) xb += 2;
2622:     }
2623:     break;
2624:   case 3:
2625:     for (i=0; i<mbs; i++) {
2626:       if (usecprow) xb = x + 3*ridx[i];
2627:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2628:       ib = idx + ii[0];
2629:       n  = ii[1] - ii[0]; ii++;
2630:       for (j=0; j<n; j++) {
2631:         rval       = ib[j]*3;
2632:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2633:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2634:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2635:         v         += 9;
2636:       }
2637:       if (!usecprow) xb += 3;
2638:     }
2639:     break;
2640:   case 4:
2641:     for (i=0; i<mbs; i++) {
2642:       if (usecprow) xb = x + 4*ridx[i];
2643:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2644:       ib = idx + ii[0];
2645:       n  = ii[1] - ii[0]; ii++;
2646:       for (j=0; j<n; j++) {
2647:         rval       = ib[j]*4;
2648:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2649:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2650:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2651:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2652:         v         += 16;
2653:       }
2654:       if (!usecprow) xb += 4;
2655:     }
2656:     break;
2657:   case 5:
2658:     for (i=0; i<mbs; i++) {
2659:       if (usecprow) xb = x + 5*ridx[i];
2660:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2661:       x4 = xb[3]; x5 = xb[4];
2662:       ib = idx + ii[0];
2663:       n  = ii[1] - ii[0]; ii++;
2664:       for (j=0; j<n; j++) {
2665:         rval       = ib[j]*5;
2666:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2667:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2668:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2669:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2670:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2671:         v         += 25;
2672:       }
2673:       if (!usecprow) xb += 5;
2674:     }
2675:     break;
2676:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2677:     PetscInt          ncols,k;
2678:     PetscScalar       *work,*workt;
2679:     const PetscScalar *xtmp;
2680:     if (!a->mult_work) {
2681:       k    = PetscMax(A->rmap->n,A->cmap->n);
2682:       PetscMalloc1(k+1,&a->mult_work);
2683:     }
2684:     work = a->mult_work;
2685:     xtmp = x;
2686:     for (i=0; i<mbs; i++) {
2687:       n     = ii[1] - ii[0]; ii++;
2688:       ncols = n*bs;
2689:       PetscArrayzero(work,ncols);
2690:       if (usecprow) xtmp = x + bs*ridx[i];
2691:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2692:       v += n*bs2;
2693:       if (!usecprow) xtmp += bs;
2694:       workt = work;
2695:       for (j=0; j<n; j++) {
2696:         zb = z + bs*(*idx++);
2697:         for (k=0; k<bs; k++) zb[k] += workt[k];
2698:         workt += bs;
2699:       }
2700:     }
2701:     }
2702:   }
2703:   VecRestoreArrayRead(xx,&x);
2704:   VecRestoreArray(zz,&z);
2705:   PetscLogFlops(2.0*a->nz*a->bs2);
2706:   return(0);
2707: }

2709: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2710: {
2711:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2712:   PetscInt       totalnz = a->bs2*a->nz;
2713:   PetscScalar    oalpha  = alpha;
2715:   PetscBLASInt   one = 1,tnz;

2718:   PetscBLASIntCast(totalnz,&tnz);
2719:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2720:   PetscLogFlops(totalnz);
2721:   return(0);
2722: }

2724: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2725: {
2727:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2728:   MatScalar      *v  = a->a;
2729:   PetscReal      sum = 0.0;
2730:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2733:   if (type == NORM_FROBENIUS) {
2734: #if defined(PETSC_USE_REAL___FP16)
2735:     PetscBLASInt one = 1,cnt = bs2*nz;
2736:     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2737: #else
2738:     for (i=0; i<bs2*nz; i++) {
2739:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2740:     }
2741: #endif
2742:     *norm = PetscSqrtReal(sum);
2743:     PetscLogFlops(2.0*bs2*nz);
2744:   } else if (type == NORM_1) { /* maximum column sum */
2745:     PetscReal *tmp;
2746:     PetscInt  *bcol = a->j;
2747:     PetscCalloc1(A->cmap->n+1,&tmp);
2748:     for (i=0; i<nz; i++) {
2749:       for (j=0; j<bs; j++) {
2750:         k1 = bs*(*bcol) + j; /* column index */
2751:         for (k=0; k<bs; k++) {
2752:           tmp[k1] += PetscAbsScalar(*v); v++;
2753:         }
2754:       }
2755:       bcol++;
2756:     }
2757:     *norm = 0.0;
2758:     for (j=0; j<A->cmap->n; j++) {
2759:       if (tmp[j] > *norm) *norm = tmp[j];
2760:     }
2761:     PetscFree(tmp);
2762:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2763:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2764:     *norm = 0.0;
2765:     for (k=0; k<bs; k++) {
2766:       for (j=0; j<a->mbs; j++) {
2767:         v   = a->a + bs2*a->i[j] + k;
2768:         sum = 0.0;
2769:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2770:           for (k1=0; k1<bs; k1++) {
2771:             sum += PetscAbsScalar(*v);
2772:             v   += bs;
2773:           }
2774:         }
2775:         if (sum > *norm) *norm = sum;
2776:       }
2777:     }
2778:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2779:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2780:   return(0);
2781: }

2783: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2784: {
2785:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2789:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2790:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2791:     *flg = PETSC_FALSE;
2792:     return(0);
2793:   }

2795:   /* if the a->i are the same */
2796:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2797:   if (!*flg) return(0);

2799:   /* if a->j are the same */
2800:   PetscArraycmp(a->j,b->j,a->nz,flg);
2801:   if (!*flg) return(0);

2803:   /* if a->a are the same */
2804:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2805:   return(0);

2807: }

2809: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2810: {
2811:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2813:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2814:   PetscScalar    *x,zero = 0.0;
2815:   MatScalar      *aa,*aa_j;

2818:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2819:   bs   = A->rmap->bs;
2820:   aa   = a->a;
2821:   ai   = a->i;
2822:   aj   = a->j;
2823:   ambs = a->mbs;
2824:   bs2  = a->bs2;

2826:   VecSet(v,zero);
2827:   VecGetArray(v,&x);
2828:   VecGetLocalSize(v,&n);
2829:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2830:   for (i=0; i<ambs; i++) {
2831:     for (j=ai[i]; j<ai[i+1]; j++) {
2832:       if (aj[j] == i) {
2833:         row  = i*bs;
2834:         aa_j = aa+j*bs2;
2835:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2836:         break;
2837:       }
2838:     }
2839:   }
2840:   VecRestoreArray(v,&x);
2841:   return(0);
2842: }

2844: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2845: {
2846:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2847:   const PetscScalar *l,*r,*li,*ri;
2848:   PetscScalar       x;
2849:   MatScalar         *aa, *v;
2850:   PetscErrorCode    ierr;
2851:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2852:   const PetscInt    *ai,*aj;

2855:   ai  = a->i;
2856:   aj  = a->j;
2857:   aa  = a->a;
2858:   m   = A->rmap->n;
2859:   n   = A->cmap->n;
2860:   bs  = A->rmap->bs;
2861:   mbs = a->mbs;
2862:   bs2 = a->bs2;
2863:   if (ll) {
2864:     VecGetArrayRead(ll,&l);
2865:     VecGetLocalSize(ll,&lm);
2866:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2867:     for (i=0; i<mbs; i++) { /* for each block row */
2868:       M  = ai[i+1] - ai[i];
2869:       li = l + i*bs;
2870:       v  = aa + bs2*ai[i];
2871:       for (j=0; j<M; j++) { /* for each block */
2872:         for (k=0; k<bs2; k++) {
2873:           (*v++) *= li[k%bs];
2874:         }
2875:       }
2876:     }
2877:     VecRestoreArrayRead(ll,&l);
2878:     PetscLogFlops(a->nz);
2879:   }

2881:   if (rr) {
2882:     VecGetArrayRead(rr,&r);
2883:     VecGetLocalSize(rr,&rn);
2884:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2885:     for (i=0; i<mbs; i++) { /* for each block row */
2886:       iai = ai[i];
2887:       M   = ai[i+1] - iai;
2888:       v   = aa + bs2*iai;
2889:       for (j=0; j<M; j++) { /* for each block */
2890:         ri = r + bs*aj[iai+j];
2891:         for (k=0; k<bs; k++) {
2892:           x = ri[k];
2893:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2894:           v += bs;
2895:         }
2896:       }
2897:     }
2898:     VecRestoreArrayRead(rr,&r);
2899:     PetscLogFlops(a->nz);
2900:   }
2901:   return(0);
2902: }

2904: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2905: {
2906:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2909:   info->block_size   = a->bs2;
2910:   info->nz_allocated = a->bs2*a->maxnz;
2911:   info->nz_used      = a->bs2*a->nz;
2912:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2913:   info->assemblies   = A->num_ass;
2914:   info->mallocs      = A->info.mallocs;
2915:   info->memory       = ((PetscObject)A)->mem;
2916:   if (A->factortype) {
2917:     info->fill_ratio_given  = A->info.fill_ratio_given;
2918:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2919:     info->factor_mallocs    = A->info.factor_mallocs;
2920:   } else {
2921:     info->fill_ratio_given  = 0;
2922:     info->fill_ratio_needed = 0;
2923:     info->factor_mallocs    = 0;
2924:   }
2925:   return(0);
2926: }

2928: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2929: {
2930:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2934:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2935:   return(0);
2936: }

2938: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2939: {

2943:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2944:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2945:   return(0);
2946: }

2948: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2949: {
2950:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2951:   PetscScalar       *z = NULL,sum1;
2952:   const PetscScalar *xb;
2953:   PetscScalar       x1;
2954:   const MatScalar   *v,*vv;
2955:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2956:   PetscBool         usecprow=a->compressedrow.use;

2959:   idx = a->j;
2960:   v   = a->a;
2961:   if (usecprow) {
2962:     mbs  = a->compressedrow.nrows;
2963:     ii   = a->compressedrow.i;
2964:     ridx = a->compressedrow.rindex;
2965:   } else {
2966:     mbs = a->mbs;
2967:     ii  = a->i;
2968:     z   = c;
2969:   }

2971:   for (i=0; i<mbs; i++) {
2972:     n           = ii[1] - ii[0]; ii++;
2973:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2974:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2975:     if (usecprow) z = c + ridx[i];
2976:     jj = idx;
2977:     vv = v;
2978:     for (k=0; k<cn; k++) {
2979:       idx = jj;
2980:       v = vv;
2981:       sum1    = 0.0;
2982:       for (j=0; j<n; j++) {
2983:         xb    = b + (*idx++); x1 = xb[0+k*bm];
2984:         sum1 += v[0]*x1;
2985:         v    += 1;
2986:       }
2987:       z[0+k*cm] = sum1;
2988:     }
2989:     if (!usecprow) z += 1;
2990:   }
2991:   return(0);
2992: }

2994: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2995: {
2996:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2997:   PetscScalar       *z = NULL,sum1,sum2;
2998:   const PetscScalar *xb;
2999:   PetscScalar       x1,x2;
3000:   const MatScalar   *v,*vv;
3001:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3002:   PetscBool         usecprow=a->compressedrow.use;

3005:   idx = a->j;
3006:   v   = a->a;
3007:   if (usecprow) {
3008:     mbs  = a->compressedrow.nrows;
3009:     ii   = a->compressedrow.i;
3010:     ridx = a->compressedrow.rindex;
3011:   } else {
3012:     mbs = a->mbs;
3013:     ii  = a->i;
3014:     z   = c;
3015:   }

3017:   for (i=0; i<mbs; i++) {
3018:     n           = ii[1] - ii[0]; ii++;
3019:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3020:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3021:     if (usecprow) z = c + 2*ridx[i];
3022:     jj = idx;
3023:     vv = v;
3024:     for (k=0; k<cn; k++) {
3025:       idx = jj;
3026:       v = vv;
3027:       sum1    = 0.0; sum2 = 0.0;
3028:       for (j=0; j<n; j++) {
3029:         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3030:         sum1 += v[0]*x1 + v[2]*x2;
3031:         sum2 += v[1]*x1 + v[3]*x2;
3032:         v    += 4;
3033:       }
3034:       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3035:     }
3036:     if (!usecprow) z += 2;
3037:   }
3038:   return(0);
3039: }

3041: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3042: {
3043:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3044:   PetscScalar       *z = NULL,sum1,sum2,sum3;
3045:   const PetscScalar *xb;
3046:   PetscScalar       x1,x2,x3;
3047:   const MatScalar   *v,*vv;
3048:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3049:   PetscBool         usecprow=a->compressedrow.use;

3052:   idx = a->j;
3053:   v   = a->a;
3054:   if (usecprow) {
3055:     mbs  = a->compressedrow.nrows;
3056:     ii   = a->compressedrow.i;
3057:     ridx = a->compressedrow.rindex;
3058:   } else {
3059:     mbs = a->mbs;
3060:     ii  = a->i;
3061:     z   = c;
3062:   }

3064:   for (i=0; i<mbs; i++) {
3065:     n           = ii[1] - ii[0]; ii++;
3066:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3067:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3068:     if (usecprow) z = c + 3*ridx[i];
3069:     jj = idx;
3070:     vv = v;
3071:     for (k=0; k<cn; k++) {
3072:       idx = jj;
3073:       v = vv;
3074:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3075:       for (j=0; j<n; j++) {
3076:         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3077:         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3078:         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3079:         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3080:         v    += 9;
3081:       }
3082:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3083:     }
3084:     if (!usecprow) z += 3;
3085:   }
3086:   return(0);
3087: }

3089: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3090: {
3091:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3092:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3093:   const PetscScalar *xb;
3094:   PetscScalar       x1,x2,x3,x4;
3095:   const MatScalar   *v,*vv;
3096:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3097:   PetscBool         usecprow=a->compressedrow.use;

3100:   idx = a->j;
3101:   v   = a->a;
3102:   if (usecprow) {
3103:     mbs  = a->compressedrow.nrows;
3104:     ii   = a->compressedrow.i;
3105:     ridx = a->compressedrow.rindex;
3106:   } else {
3107:     mbs = a->mbs;
3108:     ii  = a->i;
3109:     z   = c;
3110:   }

3112:   for (i=0; i<mbs; i++) {
3113:     n           = ii[1] - ii[0]; ii++;
3114:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3115:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3116:     if (usecprow) z = c + 4*ridx[i];
3117:     jj = idx;
3118:     vv = v;
3119:     for (k=0; k<cn; k++) {
3120:       idx = jj;
3121:       v = vv;
3122:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3123:       for (j=0; j<n; j++) {
3124:         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3125:         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3126:         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3127:         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3128:         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3129:         v    += 16;
3130:       }
3131:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3132:     }
3133:     if (!usecprow) z += 4;
3134:   }
3135:   return(0);
3136: }

3138: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3139: {
3140:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3141:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3142:   const PetscScalar *xb;
3143:   PetscScalar       x1,x2,x3,x4,x5;
3144:   const MatScalar   *v,*vv;
3145:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3146:   PetscBool         usecprow=a->compressedrow.use;

3149:   idx = a->j;
3150:   v   = a->a;
3151:   if (usecprow) {
3152:     mbs  = a->compressedrow.nrows;
3153:     ii   = a->compressedrow.i;
3154:     ridx = a->compressedrow.rindex;
3155:   } else {
3156:     mbs = a->mbs;
3157:     ii  = a->i;
3158:     z   = c;
3159:   }

3161:   for (i=0; i<mbs; i++) {
3162:     n           = ii[1] - ii[0]; ii++;
3163:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3164:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3165:     if (usecprow) z = c + 5*ridx[i];
3166:     jj = idx;
3167:     vv = v;
3168:     for (k=0; k<cn; k++) {
3169:       idx = jj;
3170:       v = vv;
3171:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3172:       for (j=0; j<n; j++) {
3173:         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3174:         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3175:         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3176:         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3177:         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3178:         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3179:         v    += 25;
3180:       }
3181:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3182:     }
3183:     if (!usecprow) z += 5;
3184:   }
3185:   return(0);
3186: }

3188: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3189: {
3190:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3191:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3192:   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3193:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3194:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3195:   PetscBLASInt      bbs,bcn,bbm,bcm;
3196:   PetscScalar       *z = NULL;
3197:   PetscScalar       *c,*b;
3198:   const MatScalar   *v;
3199:   const PetscInt    *idx,*ii,*ridx=NULL;
3200:   PetscScalar       _DZero=0.0,_DOne=1.0;
3201:   PetscBool         usecprow=a->compressedrow.use;
3202:   PetscErrorCode    ierr;

3205:   if (!cm || !cn) return(0);
3206:   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3207:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3208:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3209:   b = bd->v;
3210:   if (a->nonzerorowcnt != A->rmap->n) {
3211:     MatZeroEntries(C);
3212:   }
3213:   MatDenseGetArray(C,&c);
3214:   switch (bs) {
3215:   case 1:
3216:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3217:     break;
3218:   case 2:
3219:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3220:     break;
3221:   case 3:
3222:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3223:     break;
3224:   case 4:
3225:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3226:     break;
3227:   case 5:
3228:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3229:     break;
3230:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3231:     PetscBLASIntCast(bs,&bbs);
3232:     PetscBLASIntCast(cn,&bcn);
3233:     PetscBLASIntCast(bm,&bbm);
3234:     PetscBLASIntCast(cm,&bcm);
3235:     idx = a->j;
3236:     v   = a->a;
3237:     if (usecprow) {
3238:       mbs  = a->compressedrow.nrows;
3239:       ii   = a->compressedrow.i;
3240:       ridx = a->compressedrow.rindex;
3241:     } else {
3242:       mbs = a->mbs;
3243:       ii  = a->i;
3244:       z   = c;
3245:     }
3246:     for (i=0; i<mbs; i++) {
3247:       n = ii[1] - ii[0]; ii++;
3248:       if (usecprow) z = c + bs*ridx[i];
3249:       if (n) {
3250:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3251:         v += bs2;
3252:       }
3253:       for (j=1; j<n; j++) {
3254:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3255:         v += bs2;
3256:       }
3257:       if (!usecprow) z += bs;
3258:     }
3259:   }
3260:   MatDenseRestoreArray(C,&c);
3261:   PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3262:   return(0);
3263: }