Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8: #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 15:   const PetscInt *idx;
 16:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 17:   PetscBT        table;

 19:   m  = a->mbs;
 20:   ai = a->i;
 21:   aj = a->j;
 22:   bs = A->rmap->bs;


 26:   PetscBTCreate(m,&table);
 27:   PetscMalloc1(m+1,&nidx);
 28:   PetscMalloc1(A->rmap->N+1,&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);

 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n; ++j) {
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 43:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(&is[i]);

 48:     k = 0;
 49:     for (j=0; j<ov; j++) { /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end; l++) {
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 66:   }
 67:   PetscBTDestroy(&table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return 0;
 71: }

 73: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 74: {
 75:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 76:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 77:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 78:   const PetscInt *irow,*icol;
 79:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 80:   PetscInt       *aj = a->j,*ai = a->i;
 81:   MatScalar      *mat_a;
 82:   Mat            C;
 83:   PetscBool      flag;

 85:   ISGetIndices(isrow,&irow);
 86:   ISGetIndices(iscol,&icol);
 87:   ISGetLocalSize(isrow,&nrows);
 88:   ISGetLocalSize(iscol,&ncols);

 90:   PetscCalloc1(1+oldcols,&smap);
 91:   ssmap = smap;
 92:   PetscMalloc1(1+nrows,&lens);
 93:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 94:   /* determine lens of each row */
 95:   for (i=0; i<nrows; i++) {
 96:     kstart  = ai[irow[i]];
 97:     kend    = kstart + a->ilen[irow[i]];
 98:     lens[i] = 0;
 99:     for (k=kstart; k<kend; k++) {
100:       if (ssmap[aj[k]]) lens[i]++;
101:     }
102:   }
103:   /* Create and fill new matrix */
104:   if (scall == MAT_REUSE_MATRIX) {
105:     c = (Mat_SeqBAIJ*)((*B)->data);

108:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
110:     PetscArrayzero(c->ilen,c->mbs);
111:     C    = *B;
112:   } else {
113:     MatCreate(PetscObjectComm((PetscObject)A),&C);
114:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
115:     MatSetType(C,((PetscObject)A)->type_name);
116:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
117:   }
118:   c = (Mat_SeqBAIJ*)(C->data);
119:   for (i=0; i<nrows; i++) {
120:     row      = irow[i];
121:     kstart   = ai[row];
122:     kend     = kstart + a->ilen[row];
123:     mat_i    = c->i[i];
124:     mat_j    = c->j + mat_i;
125:     mat_a    = c->a + mat_i*bs2;
126:     mat_ilen = c->ilen + i;
127:     for (k=kstart; k<kend; k++) {
128:       if ((tcol=ssmap[a->j[k]])) {
129:         *mat_j++ = tcol - 1;
130:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
131:         mat_a   += bs2;
132:         (*mat_ilen)++;
133:       }
134:     }
135:   }
136:   /* sort */
137:   {
138:     MatScalar *work;
139:     PetscMalloc1(bs2,&work);
140:     for (i=0; i<nrows; i++) {
141:       PetscInt ilen;
142:       mat_i = c->i[i];
143:       mat_j = c->j + mat_i;
144:       mat_a = c->a + mat_i*bs2;
145:       ilen  = c->ilen[i];
146:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
147:     }
148:     PetscFree(work);
149:   }

151:   /* Free work space */
152:   ISRestoreIndices(iscol,&icol);
153:   PetscFree(smap);
154:   PetscFree(lens);
155:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

158:   ISRestoreIndices(isrow,&irow);
159:   *B   = C;
160:   return 0;
161: }

163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
167:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
168:   const PetscInt *irow,*icol;

170:   ISGetIndices(isrow,&irow);
171:   ISGetIndices(iscol,&icol);
172:   ISGetLocalSize(isrow,&nrows);
173:   ISGetLocalSize(iscol,&ncols);

175:   /* Verify if the indices corespond to each element in a block
176:    and form the IS with compressed IS */
177:   maxmnbs = PetscMax(a->mbs,a->nbs);
178:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
179:   PetscArrayzero(vary,a->mbs);
180:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
181:   for (i=0; i<a->mbs; i++) {
183:   }
184:   count = 0;
185:   for (i=0; i<nrows; i++) {
186:     j = irow[i] / bs;
187:     if ((vary[j]--)==bs) iary[count++] = j;
188:   }
189:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

191:   PetscArrayzero(vary,a->nbs);
192:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
193:   for (i=0; i<a->nbs; i++) {
195:   }
196:   count = 0;
197:   for (i=0; i<ncols; i++) {
198:     j = icol[i] / bs;
199:     if ((vary[j]--)==bs) iary[count++] = j;
200:   }
201:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
202:   ISRestoreIndices(isrow,&irow);
203:   ISRestoreIndices(iscol,&icol);
204:   PetscFree2(vary,iary);

206:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
207:   ISDestroy(&is1);
208:   ISDestroy(&is2);
209:   return 0;
210: }

212: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
213: {
214:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
215:   Mat_SubSppt    *submatj = c->submatis1;

217:   (*submatj->destroy)(C);
218:   MatDestroySubMatrix_Private(submatj);
219:   return 0;
220: }

222: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
223: {
224:   PetscInt       i;
225:   Mat            C;
226:   Mat_SeqBAIJ    *c;
227:   Mat_SubSppt    *submatj;

229:   for (i=0; i<n; i++) {
230:     C       = (*mat)[i];
231:     c       = (Mat_SeqBAIJ*)C->data;
232:     submatj = c->submatis1;
233:     if (submatj) {
234:       if (--((PetscObject)C)->refct <= 0) {
235:         (*submatj->destroy)(C);
236:         MatDestroySubMatrix_Private(submatj);
237:         PetscFree(C->defaultvectype);
238:         PetscLayoutDestroy(&C->rmap);
239:         PetscLayoutDestroy(&C->cmap);
240:         PetscHeaderDestroy(&C);
241:       }
242:     } else {
243:       MatDestroy(&C);
244:     }
245:   }

247:   /* Destroy Dummy submatrices created for reuse */
248:   MatDestroySubMatrices_Dummy(n,mat);

250:   PetscFree(*mat);
251:   return 0;
252: }

254: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
255: {
256:   PetscInt       i;

258:   if (scall == MAT_INITIAL_MATRIX) {
259:     PetscCalloc1(n+1,B);
260:   }

262:   for (i=0; i<n; i++) {
263:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
264:   }
265:   return 0;
266: }

268: /* -------------------------------------------------------*/
269: /* Should check that shapes of vectors and matrices match */
270: /* -------------------------------------------------------*/

272: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
273: {
274:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
275:   PetscScalar       *z,sum;
276:   const PetscScalar *x;
277:   const MatScalar   *v;
278:   PetscInt          mbs,i,n;
279:   const PetscInt    *idx,*ii,*ridx=NULL;
280:   PetscBool         usecprow=a->compressedrow.use;

282:   VecGetArrayRead(xx,&x);
283:   VecGetArrayWrite(zz,&z);

285:   if (usecprow) {
286:     mbs  = a->compressedrow.nrows;
287:     ii   = a->compressedrow.i;
288:     ridx = a->compressedrow.rindex;
289:     PetscArrayzero(z,a->mbs);
290:   } else {
291:     mbs = a->mbs;
292:     ii  = a->i;
293:   }

295:   for (i=0; i<mbs; i++) {
296:     n   = ii[1] - ii[0];
297:     v   = a->a + ii[0];
298:     idx = a->j + ii[0];
299:     ii++;
300:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
301:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
302:     sum = 0.0;
303:     PetscSparseDensePlusDot(sum,x,v,idx,n);
304:     if (usecprow) {
305:       z[ridx[i]] = sum;
306:     } else {
307:       z[i]       = sum;
308:     }
309:   }
310:   VecRestoreArrayRead(xx,&x);
311:   VecRestoreArrayWrite(zz,&z);
312:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
313:   return 0;
314: }

316: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
317: {
318:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
319:   PetscScalar       *z = NULL,sum1,sum2,*zarray;
320:   const PetscScalar *x,*xb;
321:   PetscScalar       x1,x2;
322:   const MatScalar   *v;
323:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
324:   PetscBool         usecprow=a->compressedrow.use;

326:   VecGetArrayRead(xx,&x);
327:   VecGetArrayWrite(zz,&zarray);

329:   idx = a->j;
330:   v   = a->a;
331:   if (usecprow) {
332:     mbs  = a->compressedrow.nrows;
333:     ii   = a->compressedrow.i;
334:     ridx = a->compressedrow.rindex;
335:     PetscArrayzero(zarray,2*a->mbs);
336:   } else {
337:     mbs = a->mbs;
338:     ii  = a->i;
339:     z   = zarray;
340:   }

342:   for (i=0; i<mbs; i++) {
343:     n           = ii[1] - ii[0]; ii++;
344:     sum1        = 0.0; sum2 = 0.0;
345:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
346:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
347:     for (j=0; j<n; j++) {
348:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
349:       sum1 += v[0]*x1 + v[2]*x2;
350:       sum2 += v[1]*x1 + v[3]*x2;
351:       v    += 4;
352:     }
353:     if (usecprow) z = zarray + 2*ridx[i];
354:     z[0] = sum1; z[1] = sum2;
355:     if (!usecprow) z += 2;
356:   }
357:   VecRestoreArrayRead(xx,&x);
358:   VecRestoreArrayWrite(zz,&zarray);
359:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
360:   return 0;
361: }

363: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
364: {
365:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
366:   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
367:   const PetscScalar *x,*xb;
368:   const MatScalar   *v;
369:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
370:   PetscBool         usecprow=a->compressedrow.use;

372: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
373: #pragma disjoint(*v,*z,*xb)
374: #endif

376:   VecGetArrayRead(xx,&x);
377:   VecGetArrayWrite(zz,&zarray);

379:   idx = a->j;
380:   v   = a->a;
381:   if (usecprow) {
382:     mbs  = a->compressedrow.nrows;
383:     ii   = a->compressedrow.i;
384:     ridx = a->compressedrow.rindex;
385:     PetscArrayzero(zarray,3*a->mbs);
386:   } else {
387:     mbs = a->mbs;
388:     ii  = a->i;
389:     z   = zarray;
390:   }

392:   for (i=0; i<mbs; i++) {
393:     n           = ii[1] - ii[0]; ii++;
394:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
395:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
396:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
397:     for (j=0; j<n; j++) {
398:       xb = x + 3*(*idx++);
399:       x1 = xb[0];
400:       x2 = xb[1];
401:       x3 = xb[2];

403:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
404:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
405:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
406:       v    += 9;
407:     }
408:     if (usecprow) z = zarray + 3*ridx[i];
409:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
410:     if (!usecprow) z += 3;
411:   }
412:   VecRestoreArrayRead(xx,&x);
413:   VecRestoreArrayWrite(zz,&zarray);
414:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
415:   return 0;
416: }

418: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
419: {
420:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
421:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
422:   const PetscScalar *x,*xb;
423:   const MatScalar   *v;
424:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
425:   PetscBool         usecprow=a->compressedrow.use;

427:   VecGetArrayRead(xx,&x);
428:   VecGetArrayWrite(zz,&zarray);

430:   idx = a->j;
431:   v   = a->a;
432:   if (usecprow) {
433:     mbs  = a->compressedrow.nrows;
434:     ii   = a->compressedrow.i;
435:     ridx = a->compressedrow.rindex;
436:     PetscArrayzero(zarray,4*a->mbs);
437:   } else {
438:     mbs = a->mbs;
439:     ii  = a->i;
440:     z   = zarray;
441:   }

443:   for (i=0; i<mbs; i++) {
444:     n = ii[1] - ii[0];
445:     ii++;
446:     sum1 = 0.0;
447:     sum2 = 0.0;
448:     sum3 = 0.0;
449:     sum4 = 0.0;

451:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
452:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
453:     for (j=0; j<n; j++) {
454:       xb    = x + 4*(*idx++);
455:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
456:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
457:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
458:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
459:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
460:       v    += 16;
461:     }
462:     if (usecprow) z = zarray + 4*ridx[i];
463:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
464:     if (!usecprow) z += 4;
465:   }
466:   VecRestoreArrayRead(xx,&x);
467:   VecRestoreArrayWrite(zz,&zarray);
468:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
469:   return 0;
470: }

472: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
473: {
474:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
475:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
476:   const PetscScalar *xb,*x;
477:   const MatScalar   *v;
478:   const PetscInt    *idx,*ii,*ridx=NULL;
479:   PetscInt          mbs,i,j,n;
480:   PetscBool         usecprow=a->compressedrow.use;

482:   VecGetArrayRead(xx,&x);
483:   VecGetArrayWrite(zz,&zarray);

485:   idx = a->j;
486:   v   = a->a;
487:   if (usecprow) {
488:     mbs  = a->compressedrow.nrows;
489:     ii   = a->compressedrow.i;
490:     ridx = a->compressedrow.rindex;
491:     PetscArrayzero(zarray,5*a->mbs);
492:   } else {
493:     mbs = a->mbs;
494:     ii  = a->i;
495:     z   = zarray;
496:   }

498:   for (i=0; i<mbs; i++) {
499:     n           = ii[1] - ii[0]; ii++;
500:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
501:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
502:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503:     for (j=0; j<n; j++) {
504:       xb    = x + 5*(*idx++);
505:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
506:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
507:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
508:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
509:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
510:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
511:       v    += 25;
512:     }
513:     if (usecprow) z = zarray + 5*ridx[i];
514:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
515:     if (!usecprow) z += 5;
516:   }
517:   VecRestoreArrayRead(xx,&x);
518:   VecRestoreArrayWrite(zz,&zarray);
519:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
520:   return 0;
521: }

523: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
524: {
525:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
526:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
527:   const PetscScalar *x,*xb;
528:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
529:   const MatScalar   *v;
530:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
531:   PetscBool         usecprow=a->compressedrow.use;

533:   VecGetArrayRead(xx,&x);
534:   VecGetArrayWrite(zz,&zarray);

536:   idx = a->j;
537:   v   = a->a;
538:   if (usecprow) {
539:     mbs  = a->compressedrow.nrows;
540:     ii   = a->compressedrow.i;
541:     ridx = a->compressedrow.rindex;
542:     PetscArrayzero(zarray,6*a->mbs);
543:   } else {
544:     mbs = a->mbs;
545:     ii  = a->i;
546:     z   = zarray;
547:   }

549:   for (i=0; i<mbs; i++) {
550:     n  = ii[1] - ii[0];
551:     ii++;
552:     sum1 = 0.0;
553:     sum2 = 0.0;
554:     sum3 = 0.0;
555:     sum4 = 0.0;
556:     sum5 = 0.0;
557:     sum6 = 0.0;

559:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
560:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
561:     for (j=0; j<n; j++) {
562:       xb    = x + 6*(*idx++);
563:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
564:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
565:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
566:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
567:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
568:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
569:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
570:       v    += 36;
571:     }
572:     if (usecprow) z = zarray + 6*ridx[i];
573:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
574:     if (!usecprow) z += 6;
575:   }

577:   VecRestoreArrayRead(xx,&x);
578:   VecRestoreArrayWrite(zz,&zarray);
579:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
580:   return 0;
581: }

583: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
584: {
585:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
586:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
587:   const PetscScalar *x,*xb;
588:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
589:   const MatScalar   *v;
590:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
591:   PetscBool         usecprow=a->compressedrow.use;

593:   VecGetArrayRead(xx,&x);
594:   VecGetArrayWrite(zz,&zarray);

596:   idx = a->j;
597:   v   = a->a;
598:   if (usecprow) {
599:     mbs  = a->compressedrow.nrows;
600:     ii   = a->compressedrow.i;
601:     ridx = a->compressedrow.rindex;
602:     PetscArrayzero(zarray,7*a->mbs);
603:   } else {
604:     mbs = a->mbs;
605:     ii  = a->i;
606:     z   = zarray;
607:   }

609:   for (i=0; i<mbs; i++) {
610:     n  = ii[1] - ii[0];
611:     ii++;
612:     sum1 = 0.0;
613:     sum2 = 0.0;
614:     sum3 = 0.0;
615:     sum4 = 0.0;
616:     sum5 = 0.0;
617:     sum6 = 0.0;
618:     sum7 = 0.0;

620:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
621:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
622:     for (j=0; j<n; j++) {
623:       xb    = x + 7*(*idx++);
624:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
625:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
626:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
627:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
628:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
629:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
630:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
631:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
632:       v    += 49;
633:     }
634:     if (usecprow) z = zarray + 7*ridx[i];
635:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
636:     if (!usecprow) z += 7;
637:   }

639:   VecRestoreArrayRead(xx,&x);
640:   VecRestoreArrayWrite(zz,&zarray);
641:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
642:   return 0;
643: }

645: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
646: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
647: {
648:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
649:   PetscScalar       *z = NULL,*work,*workt,*zarray;
650:   const PetscScalar *x,*xb;
651:   const MatScalar   *v;
652:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
653:   const PetscInt    *idx,*ii,*ridx=NULL;
654:   PetscInt          k;
655:   PetscBool         usecprow=a->compressedrow.use;

657:   __m256d a0,a1,a2,a3,a4,a5;
658:   __m256d w0,w1,w2,w3;
659:   __m256d z0,z1,z2;
660:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

662:   VecGetArrayRead(xx,&x);
663:   VecGetArrayWrite(zz,&zarray);

665:   idx = a->j;
666:   v   = a->a;
667:   if (usecprow) {
668:     mbs  = a->compressedrow.nrows;
669:     ii   = a->compressedrow.i;
670:     ridx = a->compressedrow.rindex;
671:     PetscArrayzero(zarray,bs*a->mbs);
672:   } else {
673:     mbs = a->mbs;
674:     ii  = a->i;
675:     z   = zarray;
676:   }

678:   if (!a->mult_work) {
679:     k    = PetscMax(A->rmap->n,A->cmap->n);
680:     PetscMalloc1(k+1,&a->mult_work);
681:   }

683:   work = a->mult_work;
684:   for (i=0; i<mbs; i++) {
685:     n           = ii[1] - ii[0]; ii++;
686:     workt       = work;
687:     for (j=0; j<n; j++) {
688:       xb = x + bs*(*idx++);
689:       for (k=0; k<bs; k++) workt[k] = xb[k];
690:       workt += bs;
691:     }
692:     if (usecprow) z = zarray + bs*ridx[i];

694:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

696:     for (j=0; j<n; j++) {
697:       /* first column of a */
698:       w0 = _mm256_set1_pd(work[j*9  ]);
699:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
700:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
701:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

703:       /* second column of a */
704:       w1 = _mm256_set1_pd(work[j*9+ 1]);
705:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
706:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
707:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

709:       /* third column of a */
710:       w2 = _mm256_set1_pd(work[j*9 +2]);
711:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
712:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
713:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

715:       /* fourth column of a */
716:       w3 = _mm256_set1_pd(work[j*9+ 3]);
717:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
718:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
719:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

721:       /* fifth column of a */
722:       w0 = _mm256_set1_pd(work[j*9+ 4]);
723:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
724:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
725:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

727:       /* sixth column of a */
728:       w1 = _mm256_set1_pd(work[j*9+ 5]);
729:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
730:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
731:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

733:       /* seventh column of a */
734:       w2 = _mm256_set1_pd(work[j*9+ 6]);
735:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
736:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
737:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

739:       /* eighth column of a */
740:       w3 = _mm256_set1_pd(work[j*9+ 7]);
741:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
742:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
743:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

745:       /* ninth column of a */
746:       w0 = _mm256_set1_pd(work[j*9+ 8]);
747:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
748:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
749:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
750:     }

752:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

754:     v += n*bs2;
755:     if (!usecprow) z += bs;
756:   }
757:   VecRestoreArrayRead(xx,&x);
758:   VecRestoreArrayWrite(zz,&zarray);
759:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
760:   return 0;
761: }
762: #endif

764: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
765: {
766:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
767:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
768:   const PetscScalar *x,*xb;
769:   PetscScalar       *zarray,xv;
770:   const MatScalar   *v;
771:   const PetscInt    *ii,*ij=a->j,*idx;
772:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
773:   PetscBool         usecprow=a->compressedrow.use;

775:   VecGetArrayRead(xx,&x);
776:   VecGetArrayWrite(zz,&zarray);

778:   v = a->a;
779:   if (usecprow) {
780:     mbs  = a->compressedrow.nrows;
781:     ii   = a->compressedrow.i;
782:     ridx = a->compressedrow.rindex;
783:     PetscArrayzero(zarray,11*a->mbs);
784:   } else {
785:     mbs = a->mbs;
786:     ii  = a->i;
787:     z   = zarray;
788:   }

790:   for (i=0; i<mbs; i++) {
791:     n    = ii[i+1] - ii[i];
792:     idx  = ij + ii[i];
793:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
794:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

796:     for (j=0; j<n; j++) {
797:       xb = x + 11*(idx[j]);

799:       for (k=0; k<11; k++) {
800:         xv     =  xb[k];
801:         sum1  += v[0]*xv;
802:         sum2  += v[1]*xv;
803:         sum3  += v[2]*xv;
804:         sum4  += v[3]*xv;
805:         sum5  += v[4]*xv;
806:         sum6  += v[5]*xv;
807:         sum7  += v[6]*xv;
808:         sum8  += v[7]*xv;
809:         sum9  += v[8]*xv;
810:         sum10 += v[9]*xv;
811:         sum11 += v[10]*xv;
812:         v     += 11;
813:       }
814:     }
815:     if (usecprow) z = zarray + 11*ridx[i];
816:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
817:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

819:     if (!usecprow) z += 11;
820:   }

822:   VecRestoreArrayRead(xx,&x);
823:   VecRestoreArrayWrite(zz,&zarray);
824:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
825:   return 0;
826: }

828: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
829: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
830: {
831:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
832:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
833:   const PetscScalar *x,*xb;
834:   PetscScalar       *zarray,xv;
835:   const MatScalar   *v;
836:   const PetscInt    *ii,*ij=a->j,*idx;
837:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
838:   PetscBool         usecprow=a->compressedrow.use;

840:   VecGetArrayRead(xx,&x);
841:   VecGetArrayWrite(zz,&zarray);

843:   v = a->a;
844:   if (usecprow) {
845:     mbs  = a->compressedrow.nrows;
846:     ii   = a->compressedrow.i;
847:     ridx = a->compressedrow.rindex;
848:     PetscArrayzero(zarray,12*a->mbs);
849:   } else {
850:     mbs = a->mbs;
851:     ii  = a->i;
852:     z   = zarray;
853:   }

855:   for (i=0; i<mbs; i++) {
856:     n    = ii[i+1] - ii[i];
857:     idx  = ij + ii[i];
858:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
859:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;

861:     for (j=0; j<n; j++) {
862:       xb = x + 12*(idx[j]);

864:       for (k=0; k<12; k++) {
865:         xv     =  xb[k];
866:         sum1  += v[0]*xv;
867:         sum2  += v[1]*xv;
868:         sum3  += v[2]*xv;
869:         sum4  += v[3]*xv;
870:         sum5  += v[4]*xv;
871:         sum6  += v[5]*xv;
872:         sum7  += v[6]*xv;
873:         sum8  += v[7]*xv;
874:         sum9  += v[8]*xv;
875:         sum10 += v[9]*xv;
876:         sum11 += v[10]*xv;
877:         sum12 += v[11]*xv;
878:         v     += 12;
879:       }
880:     }
881:     if (usecprow) z = zarray + 12*ridx[i];
882:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
883:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
884:     if (!usecprow) z += 12;
885:   }
886:   VecRestoreArrayRead(xx,&x);
887:   VecRestoreArrayWrite(zz,&zarray);
888:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
889:   return 0;
890: }

892: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
893: {
894:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
895:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
896:   const PetscScalar *x,*xb;
897:   PetscScalar       *zarray,*yarray,xv;
898:   const MatScalar   *v;
899:   const PetscInt    *ii,*ij=a->j,*idx;
900:   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
901:   PetscBool         usecprow=a->compressedrow.use;

903:   VecGetArrayRead(xx,&x);
904:   VecGetArrayPair(yy,zz,&yarray,&zarray);

906:   v = a->a;
907:   if (usecprow) {
908:    if (zz != yy) {
909:      PetscArraycpy(zarray,yarray,12*mbs);
910:     }
911:     mbs  = a->compressedrow.nrows;
912:     ii   = a->compressedrow.i;
913:     ridx = a->compressedrow.rindex;
914:   } else {
915:     ii  = a->i;
916:     y   = yarray;
917:     z   = zarray;
918:   }

920:   for (i=0; i<mbs; i++) {
921:     n    = ii[i+1] - ii[i];
922:     idx  = ij + ii[i];

924:     if (usecprow) {
925:       y = yarray + 12*ridx[i];
926:       z = zarray + 12*ridx[i];
927:     }
928:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
929:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

931:     for (j=0; j<n; j++) {
932:       xb = x + 12*(idx[j]);

934:       for (k=0; k<12; k++) {
935:         xv     =  xb[k];
936:         sum1  += v[0]*xv;
937:         sum2  += v[1]*xv;
938:         sum3  += v[2]*xv;
939:         sum4  += v[3]*xv;
940:         sum5  += v[4]*xv;
941:         sum6  += v[5]*xv;
942:         sum7  += v[6]*xv;
943:         sum8  += v[7]*xv;
944:         sum9  += v[8]*xv;
945:         sum10 += v[9]*xv;
946:         sum11 += v[10]*xv;
947:         sum12 += v[11]*xv;
948:         v     += 12;
949:       }
950:     }

952:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
953:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
954:     if (!usecprow) {
955:       y += 12;
956:       z += 12;
957:     }
958:   }
959:   VecRestoreArrayRead(xx,&x);
960:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
961:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
962:   return 0;
963: }

965: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
966: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
967: {
968:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
969:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
970:   const PetscScalar *x,*xb;
971:   PetscScalar       x1,x2,x3,x4,*zarray;
972:   const MatScalar   *v;
973:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
974:   PetscInt          mbs,i,j,n;
975:   PetscBool         usecprow=a->compressedrow.use;

977:   VecGetArrayRead(xx,&x);
978:   VecGetArrayWrite(zz,&zarray);

980:   v = a->a;
981:   if (usecprow) {
982:     mbs  = a->compressedrow.nrows;
983:     ii   = a->compressedrow.i;
984:     ridx = a->compressedrow.rindex;
985:     PetscArrayzero(zarray,12*a->mbs);
986:   } else {
987:     mbs = a->mbs;
988:     ii  = a->i;
989:     z   = zarray;
990:   }

992:   for (i=0; i<mbs; i++) {
993:     n    = ii[i+1] - ii[i];
994:     idx  = ij + ii[i];

996:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
997:     for (j=0; j<n; j++) {
998:       xb = x + 12*(idx[j]);
999:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1001:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1002:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1003:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1004:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1005:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1006:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1007:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1008:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1009:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1010:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1011:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1012:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1013:       v += 48;

1015:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1017:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1018:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1019:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1020:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1021:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1022:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1023:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1024:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1025:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1026:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1027:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1028:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1029:       v     += 48;

1031:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1032:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1033:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1034:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1035:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1036:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1037:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1038:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1039:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1040:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1041:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1042:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1043:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1044:       v     += 48;

1046:     }
1047:     if (usecprow) z = zarray + 12*ridx[i];
1048:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1049:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1050:     if (!usecprow) z += 12;
1051:   }
1052:   VecRestoreArrayRead(xx,&x);
1053:   VecRestoreArrayWrite(zz,&zarray);
1054:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1055:   return 0;
1056: }

1058: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1059: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1060: {
1061:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1062:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1063:   const PetscScalar *x,*xb;
1064:   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1065:   const MatScalar   *v;
1066:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1067:   PetscInt          mbs = a->mbs,i,j,n;
1068:   PetscBool         usecprow=a->compressedrow.use;

1070:   VecGetArrayRead(xx,&x);
1071:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1073:   v = a->a;
1074:   if (usecprow) {
1075:     if (zz != yy) {
1076:       PetscArraycpy(zarray,yarray,12*mbs);
1077:     }
1078:     mbs  = a->compressedrow.nrows;
1079:     ii   = a->compressedrow.i;
1080:     ridx = a->compressedrow.rindex;
1081:   } else {
1082:     ii  = a->i;
1083:     y   = yarray;
1084:     z   = zarray;
1085:   }

1087:   for (i=0; i<mbs; i++) {
1088:     n    = ii[i+1] - ii[i];
1089:     idx  = ij + ii[i];

1091:     if (usecprow) {
1092:       y = yarray + 12*ridx[i];
1093:       z = zarray + 12*ridx[i];
1094:     }
1095:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1096:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

1098:     for (j=0; j<n; j++) {
1099:       xb = x + 12*(idx[j]);
1100:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1102:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1103:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1104:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1105:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1106:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1107:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1108:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1109:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1110:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1111:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1112:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1113:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1114:       v += 48;

1116:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1118:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1119:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1120:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1121:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1122:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1123:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1124:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1125:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1126:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1127:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1128:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1129:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1130:       v     += 48;

1132:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1133:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1134:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1135:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1136:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1137:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1138:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1139:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1140:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1141:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1142:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1143:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1144:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1145:       v     += 48;

1147:     }
1148:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1149:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1150:     if (!usecprow) {
1151:       y += 12;
1152:       z += 12;
1153:     }
1154:   }
1155:   VecRestoreArrayRead(xx,&x);
1156:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1157:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1158:   return 0;
1159: }

1161: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1162: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1163: {
1164:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1165:   PetscScalar       *z = NULL,*zarray;
1166:   const PetscScalar *x,*work;
1167:   const MatScalar   *v = a->a;
1168:   PetscInt          mbs,i,j,n;
1169:   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1170:   PetscBool         usecprow=a->compressedrow.use;
1171:   const PetscInt    bs = 12, bs2 = 144;

1173:   __m256d a0,a1,a2,a3,a4,a5;
1174:   __m256d w0,w1,w2,w3;
1175:   __m256d z0,z1,z2;

1177:   VecGetArrayRead(xx,&x);
1178:   VecGetArrayWrite(zz,&zarray);

1180:   if (usecprow) {
1181:     mbs  = a->compressedrow.nrows;
1182:     ii   = a->compressedrow.i;
1183:     ridx = a->compressedrow.rindex;
1184:     PetscArrayzero(zarray,bs*a->mbs);
1185:   } else {
1186:     mbs = a->mbs;
1187:     ii  = a->i;
1188:     z   = zarray;
1189:   }

1191:   for (i=0; i<mbs; i++) {
1192:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

1194:     n  = ii[1] - ii[0]; ii++;
1195:     for (j=0; j<n; j++) {
1196:       work = x + bs*(*idx++);

1198:       /* first column of a */
1199:       w0 = _mm256_set1_pd(work[0]);
1200:       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1201:       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1202:       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);

1204:       /* second column of a */
1205:       w1 = _mm256_set1_pd(work[1]);
1206:       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1207:       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1208:       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);

1210:       /* third column of a */
1211:       w2 = _mm256_set1_pd(work[2]);
1212:       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1213:       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1214:       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);

1216:       /* fourth column of a */
1217:       w3 = _mm256_set1_pd(work[3]);
1218:       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1219:       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1220:       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);

1222:       /* fifth column of a */
1223:       w0 = _mm256_set1_pd(work[4]);
1224:       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1225:       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1226:       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);

1228:       /* sixth column of a */
1229:       w1 = _mm256_set1_pd(work[5]);
1230:       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1231:       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1232:       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);

1234:       /* seventh column of a */
1235:       w2 = _mm256_set1_pd(work[6]);
1236:       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1237:       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1238:       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);

1240:       /* eighth column of a */
1241:       w3 = _mm256_set1_pd(work[7]);
1242:       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1243:       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1244:       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);

1246:       /* ninth column of a */
1247:       w0 = _mm256_set1_pd(work[8]);
1248:       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1249:       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1250:       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);

1252:       /* tenth column of a */
1253:       w1 = _mm256_set1_pd(work[9]);
1254:       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1255:       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1256:       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);

1258:       /* eleventh column of a */
1259:       w2 = _mm256_set1_pd(work[10]);
1260:       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1261:       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1262:       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);

1264:       /* twelveth column of a */
1265:       w3 = _mm256_set1_pd(work[11]);
1266:       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1267:       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1268:       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);

1270:       v += bs2;
1271:     }
1272:     if (usecprow) z = zarray + bs*ridx[i];
1273:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1274:     if (!usecprow) z += bs;
1275:   }
1276:   VecRestoreArrayRead(xx,&x);
1277:   VecRestoreArrayWrite(zz,&zarray);
1278:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1279:   return 0;
1280: }
1281: #endif

1283: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1284: /* Default MatMult for block size 15 */
1285: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1286: {
1287:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1288:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1289:   const PetscScalar *x,*xb;
1290:   PetscScalar       *zarray,xv;
1291:   const MatScalar   *v;
1292:   const PetscInt    *ii,*ij=a->j,*idx;
1293:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1294:   PetscBool         usecprow=a->compressedrow.use;

1296:   VecGetArrayRead(xx,&x);
1297:   VecGetArrayWrite(zz,&zarray);

1299:   v = a->a;
1300:   if (usecprow) {
1301:     mbs  = a->compressedrow.nrows;
1302:     ii   = a->compressedrow.i;
1303:     ridx = a->compressedrow.rindex;
1304:     PetscArrayzero(zarray,15*a->mbs);
1305:   } else {
1306:     mbs = a->mbs;
1307:     ii  = a->i;
1308:     z   = zarray;
1309:   }

1311:   for (i=0; i<mbs; i++) {
1312:     n    = ii[i+1] - ii[i];
1313:     idx  = ij + ii[i];
1314:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1315:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1317:     for (j=0; j<n; j++) {
1318:       xb = x + 15*(idx[j]);

1320:       for (k=0; k<15; k++) {
1321:         xv     =  xb[k];
1322:         sum1  += v[0]*xv;
1323:         sum2  += v[1]*xv;
1324:         sum3  += v[2]*xv;
1325:         sum4  += v[3]*xv;
1326:         sum5  += v[4]*xv;
1327:         sum6  += v[5]*xv;
1328:         sum7  += v[6]*xv;
1329:         sum8  += v[7]*xv;
1330:         sum9  += v[8]*xv;
1331:         sum10 += v[9]*xv;
1332:         sum11 += v[10]*xv;
1333:         sum12 += v[11]*xv;
1334:         sum13 += v[12]*xv;
1335:         sum14 += v[13]*xv;
1336:         sum15 += v[14]*xv;
1337:         v     += 15;
1338:       }
1339:     }
1340:     if (usecprow) z = zarray + 15*ridx[i];
1341:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1342:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1344:     if (!usecprow) z += 15;
1345:   }

1347:   VecRestoreArrayRead(xx,&x);
1348:   VecRestoreArrayWrite(zz,&zarray);
1349:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1350:   return 0;
1351: }

1353: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1354: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1355: {
1356:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1357:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1358:   const PetscScalar *x,*xb;
1359:   PetscScalar       x1,x2,x3,x4,*zarray;
1360:   const MatScalar   *v;
1361:   const PetscInt    *ii,*ij=a->j,*idx;
1362:   PetscInt          mbs,i,j,n,*ridx=NULL;
1363:   PetscBool         usecprow=a->compressedrow.use;

1365:   VecGetArrayRead(xx,&x);
1366:   VecGetArrayWrite(zz,&zarray);

1368:   v = a->a;
1369:   if (usecprow) {
1370:     mbs  = a->compressedrow.nrows;
1371:     ii   = a->compressedrow.i;
1372:     ridx = a->compressedrow.rindex;
1373:     PetscArrayzero(zarray,15*a->mbs);
1374:   } else {
1375:     mbs = a->mbs;
1376:     ii  = a->i;
1377:     z   = zarray;
1378:   }

1380:   for (i=0; i<mbs; i++) {
1381:     n    = ii[i+1] - ii[i];
1382:     idx  = ij + ii[i];
1383:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1384:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1386:     for (j=0; j<n; j++) {
1387:       xb = x + 15*(idx[j]);
1388:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1390:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1391:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1392:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1393:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1394:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1395:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1396:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1397:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1398:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1399:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1400:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1401:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1402:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1403:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1404:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

1406:       v += 60;

1408:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1410:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1411:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1412:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1413:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1414:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1415:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1416:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1417:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1418:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1419:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1420:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1421:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1422:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1423:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1424:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1425:       v     += 60;

1427:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1428:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1429:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1430:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1431:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1432:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1433:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1434:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1435:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1436:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1437:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1438:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1439:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1440:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1441:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1442:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1443:       v     += 60;

1445:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1446:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1447:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1448:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1449:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1450:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1451:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1452:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1453:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1454:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1455:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1456:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1457:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1458:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1459:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1460:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1461:       v     += 45;
1462:     }
1463:     if (usecprow) z = zarray + 15*ridx[i];
1464:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1465:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1467:     if (!usecprow) z += 15;
1468:   }

1470:   VecRestoreArrayRead(xx,&x);
1471:   VecRestoreArrayWrite(zz,&zarray);
1472:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1473:   return 0;
1474: }

1476: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1477: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1478: {
1479:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1480:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1481:   const PetscScalar *x,*xb;
1482:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1483:   const MatScalar   *v;
1484:   const PetscInt    *ii,*ij=a->j,*idx;
1485:   PetscInt          mbs,i,j,n,*ridx=NULL;
1486:   PetscBool         usecprow=a->compressedrow.use;

1488:   VecGetArrayRead(xx,&x);
1489:   VecGetArrayWrite(zz,&zarray);

1491:   v = a->a;
1492:   if (usecprow) {
1493:     mbs  = a->compressedrow.nrows;
1494:     ii   = a->compressedrow.i;
1495:     ridx = a->compressedrow.rindex;
1496:     PetscArrayzero(zarray,15*a->mbs);
1497:   } else {
1498:     mbs = a->mbs;
1499:     ii  = a->i;
1500:     z   = zarray;
1501:   }

1503:   for (i=0; i<mbs; i++) {
1504:     n    = ii[i+1] - ii[i];
1505:     idx  = ij + ii[i];
1506:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1507:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1509:     for (j=0; j<n; j++) {
1510:       xb = x + 15*(idx[j]);
1511:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1512:       x8 = xb[7];

1514:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1515:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1516:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1517:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1518:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1519:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1520:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1521:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1522:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1523:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1524:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1525:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1526:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1527:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1528:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1529:       v     += 120;

1531:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1533:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1534:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1535:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1536:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1537:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1538:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1539:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1540:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1541:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1542:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1543:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1544:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1545:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1546:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1547:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1548:       v     += 105;
1549:     }
1550:     if (usecprow) z = zarray + 15*ridx[i];
1551:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1552:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1554:     if (!usecprow) z += 15;
1555:   }

1557:   VecRestoreArrayRead(xx,&x);
1558:   VecRestoreArrayWrite(zz,&zarray);
1559:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1560:   return 0;
1561: }

1563: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1564: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1565: {
1566:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1567:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1568:   const PetscScalar *x,*xb;
1569:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1570:   const MatScalar   *v;
1571:   const PetscInt    *ii,*ij=a->j,*idx;
1572:   PetscInt          mbs,i,j,n,*ridx=NULL;
1573:   PetscBool         usecprow=a->compressedrow.use;

1575:   VecGetArrayRead(xx,&x);
1576:   VecGetArrayWrite(zz,&zarray);

1578:   v = a->a;
1579:   if (usecprow) {
1580:     mbs  = a->compressedrow.nrows;
1581:     ii   = a->compressedrow.i;
1582:     ridx = a->compressedrow.rindex;
1583:     PetscArrayzero(zarray,15*a->mbs);
1584:   } else {
1585:     mbs = a->mbs;
1586:     ii  = a->i;
1587:     z   = zarray;
1588:   }

1590:   for (i=0; i<mbs; i++) {
1591:     n    = ii[i+1] - ii[i];
1592:     idx  = ij + ii[i];
1593:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1594:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1596:     for (j=0; j<n; j++) {
1597:       xb = x + 15*(idx[j]);
1598:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1599:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1601:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1602:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1603:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1604:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1605:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1606:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1607:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1608:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1609:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1610:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1611:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1612:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1613:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1614:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1615:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1616:       v     += 225;
1617:     }
1618:     if (usecprow) z = zarray + 15*ridx[i];
1619:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1620:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1622:     if (!usecprow) z += 15;
1623:   }

1625:   VecRestoreArrayRead(xx,&x);
1626:   VecRestoreArrayWrite(zz,&zarray);
1627:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1628:   return 0;
1629: }

1631: /*
1632:     This will not work with MatScalar == float because it calls the BLAS
1633: */
1634: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1635: {
1636:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1637:   PetscScalar       *z = NULL,*work,*workt,*zarray;
1638:   const PetscScalar *x,*xb;
1639:   const MatScalar   *v;
1640:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1641:   const PetscInt    *idx,*ii,*ridx=NULL;
1642:   PetscInt          ncols,k;
1643:   PetscBool         usecprow=a->compressedrow.use;

1645:   VecGetArrayRead(xx,&x);
1646:   VecGetArrayWrite(zz,&zarray);

1648:   idx = a->j;
1649:   v   = a->a;
1650:   if (usecprow) {
1651:     mbs  = a->compressedrow.nrows;
1652:     ii   = a->compressedrow.i;
1653:     ridx = a->compressedrow.rindex;
1654:     PetscArrayzero(zarray,bs*a->mbs);
1655:   } else {
1656:     mbs = a->mbs;
1657:     ii  = a->i;
1658:     z   = zarray;
1659:   }

1661:   if (!a->mult_work) {
1662:     k    = PetscMax(A->rmap->n,A->cmap->n);
1663:     PetscMalloc1(k+1,&a->mult_work);
1664:   }
1665:   work = a->mult_work;
1666:   for (i=0; i<mbs; i++) {
1667:     n           = ii[1] - ii[0]; ii++;
1668:     ncols       = n*bs;
1669:     workt       = work;
1670:     for (j=0; j<n; j++) {
1671:       xb = x + bs*(*idx++);
1672:       for (k=0; k<bs; k++) workt[k] = xb[k];
1673:       workt += bs;
1674:     }
1675:     if (usecprow) z = zarray + bs*ridx[i];
1676:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1677:     v += n*bs2;
1678:     if (!usecprow) z += bs;
1679:   }
1680:   VecRestoreArrayRead(xx,&x);
1681:   VecRestoreArrayWrite(zz,&zarray);
1682:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1683:   return 0;
1684: }

1686: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1687: {
1688:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1689:   const PetscScalar *x;
1690:   PetscScalar       *y,*z,sum;
1691:   const MatScalar   *v;
1692:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1693:   const PetscInt    *idx,*ii;
1694:   PetscBool         usecprow=a->compressedrow.use;

1696:   VecGetArrayRead(xx,&x);
1697:   VecGetArrayPair(yy,zz,&y,&z);

1699:   idx = a->j;
1700:   v   = a->a;
1701:   if (usecprow) {
1702:     if (zz != yy) {
1703:       PetscArraycpy(z,y,mbs);
1704:     }
1705:     mbs  = a->compressedrow.nrows;
1706:     ii   = a->compressedrow.i;
1707:     ridx = a->compressedrow.rindex;
1708:   } else {
1709:     ii = a->i;
1710:   }

1712:   for (i=0; i<mbs; i++) {
1713:     n = ii[1] - ii[0];
1714:     ii++;
1715:     if (!usecprow) {
1716:       sum = y[i];
1717:     } else {
1718:       sum = y[ridx[i]];
1719:     }
1720:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1721:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1722:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1723:     v   += n;
1724:     idx += n;
1725:     if (usecprow) {
1726:       z[ridx[i]] = sum;
1727:     } else {
1728:       z[i] = sum;
1729:     }
1730:   }
1731:   VecRestoreArrayRead(xx,&x);
1732:   VecRestoreArrayPair(yy,zz,&y,&z);
1733:   PetscLogFlops(2.0*a->nz);
1734:   return 0;
1735: }

1737: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1738: {
1739:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1740:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1741:   const PetscScalar *x,*xb;
1742:   PetscScalar       x1,x2,*yarray,*zarray;
1743:   const MatScalar   *v;
1744:   PetscInt          mbs = a->mbs,i,n,j;
1745:   const PetscInt    *idx,*ii,*ridx = NULL;
1746:   PetscBool         usecprow = a->compressedrow.use;

1748:   VecGetArrayRead(xx,&x);
1749:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1751:   idx = a->j;
1752:   v   = a->a;
1753:   if (usecprow) {
1754:     if (zz != yy) {
1755:       PetscArraycpy(zarray,yarray,2*mbs);
1756:     }
1757:     mbs  = a->compressedrow.nrows;
1758:     ii   = a->compressedrow.i;
1759:     ridx = a->compressedrow.rindex;
1760:   } else {
1761:     ii = a->i;
1762:     y  = yarray;
1763:     z  = zarray;
1764:   }

1766:   for (i=0; i<mbs; i++) {
1767:     n = ii[1] - ii[0]; ii++;
1768:     if (usecprow) {
1769:       z = zarray + 2*ridx[i];
1770:       y = yarray + 2*ridx[i];
1771:     }
1772:     sum1 = y[0]; sum2 = y[1];
1773:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1774:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1775:     for (j=0; j<n; j++) {
1776:       xb = x + 2*(*idx++);
1777:       x1 = xb[0];
1778:       x2 = xb[1];

1780:       sum1 += v[0]*x1 + v[2]*x2;
1781:       sum2 += v[1]*x1 + v[3]*x2;
1782:       v    += 4;
1783:     }
1784:     z[0] = sum1; z[1] = sum2;
1785:     if (!usecprow) {
1786:       z += 2; y += 2;
1787:     }
1788:   }
1789:   VecRestoreArrayRead(xx,&x);
1790:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1791:   PetscLogFlops(4.0*a->nz);
1792:   return 0;
1793: }

1795: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1796: {
1797:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1798:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1799:   const PetscScalar *x,*xb;
1800:   const MatScalar   *v;
1801:   PetscInt          mbs = a->mbs,i,j,n;
1802:   const PetscInt    *idx,*ii,*ridx = NULL;
1803:   PetscBool         usecprow = a->compressedrow.use;

1805:   VecGetArrayRead(xx,&x);
1806:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1808:   idx = a->j;
1809:   v   = a->a;
1810:   if (usecprow) {
1811:     if (zz != yy) {
1812:       PetscArraycpy(zarray,yarray,3*mbs);
1813:     }
1814:     mbs  = a->compressedrow.nrows;
1815:     ii   = a->compressedrow.i;
1816:     ridx = a->compressedrow.rindex;
1817:   } else {
1818:     ii = a->i;
1819:     y  = yarray;
1820:     z  = zarray;
1821:   }

1823:   for (i=0; i<mbs; i++) {
1824:     n = ii[1] - ii[0]; ii++;
1825:     if (usecprow) {
1826:       z = zarray + 3*ridx[i];
1827:       y = yarray + 3*ridx[i];
1828:     }
1829:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1830:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1831:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1832:     for (j=0; j<n; j++) {
1833:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1834:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1835:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1836:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1837:       v    += 9;
1838:     }
1839:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1840:     if (!usecprow) {
1841:       z += 3; y += 3;
1842:     }
1843:   }
1844:   VecRestoreArrayRead(xx,&x);
1845:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1846:   PetscLogFlops(18.0*a->nz);
1847:   return 0;
1848: }

1850: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1851: {
1852:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1853:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1854:   const PetscScalar *x,*xb;
1855:   const MatScalar   *v;
1856:   PetscInt          mbs = a->mbs,i,j,n;
1857:   const PetscInt    *idx,*ii,*ridx=NULL;
1858:   PetscBool         usecprow=a->compressedrow.use;

1860:   VecGetArrayRead(xx,&x);
1861:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1863:   idx = a->j;
1864:   v   = a->a;
1865:   if (usecprow) {
1866:     if (zz != yy) {
1867:       PetscArraycpy(zarray,yarray,4*mbs);
1868:     }
1869:     mbs  = a->compressedrow.nrows;
1870:     ii   = a->compressedrow.i;
1871:     ridx = a->compressedrow.rindex;
1872:   } else {
1873:     ii = a->i;
1874:     y  = yarray;
1875:     z  = zarray;
1876:   }

1878:   for (i=0; i<mbs; i++) {
1879:     n = ii[1] - ii[0]; ii++;
1880:     if (usecprow) {
1881:       z = zarray + 4*ridx[i];
1882:       y = yarray + 4*ridx[i];
1883:     }
1884:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1885:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1886:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1887:     for (j=0; j<n; j++) {
1888:       xb    = x + 4*(*idx++);
1889:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1890:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1891:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1892:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1893:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1894:       v    += 16;
1895:     }
1896:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1897:     if (!usecprow) {
1898:       z += 4; y += 4;
1899:     }
1900:   }
1901:   VecRestoreArrayRead(xx,&x);
1902:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1903:   PetscLogFlops(32.0*a->nz);
1904:   return 0;
1905: }

1907: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1908: {
1909:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1910:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1911:   const PetscScalar *x,*xb;
1912:   PetscScalar       *yarray,*zarray;
1913:   const MatScalar   *v;
1914:   PetscInt          mbs = a->mbs,i,j,n;
1915:   const PetscInt    *idx,*ii,*ridx = NULL;
1916:   PetscBool         usecprow=a->compressedrow.use;

1918:   VecGetArrayRead(xx,&x);
1919:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1921:   idx = a->j;
1922:   v   = a->a;
1923:   if (usecprow) {
1924:     if (zz != yy) {
1925:       PetscArraycpy(zarray,yarray,5*mbs);
1926:     }
1927:     mbs  = a->compressedrow.nrows;
1928:     ii   = a->compressedrow.i;
1929:     ridx = a->compressedrow.rindex;
1930:   } else {
1931:     ii = a->i;
1932:     y  = yarray;
1933:     z  = zarray;
1934:   }

1936:   for (i=0; i<mbs; i++) {
1937:     n = ii[1] - ii[0]; ii++;
1938:     if (usecprow) {
1939:       z = zarray + 5*ridx[i];
1940:       y = yarray + 5*ridx[i];
1941:     }
1942:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1943:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1944:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1945:     for (j=0; j<n; j++) {
1946:       xb    = x + 5*(*idx++);
1947:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1948:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1949:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1950:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1951:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1952:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1953:       v    += 25;
1954:     }
1955:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1956:     if (!usecprow) {
1957:       z += 5; y += 5;
1958:     }
1959:   }
1960:   VecRestoreArrayRead(xx,&x);
1961:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1962:   PetscLogFlops(50.0*a->nz);
1963:   return 0;
1964: }

1966: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1967: {
1968:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1969:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
1970:   const PetscScalar *x,*xb;
1971:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1972:   const MatScalar   *v;
1973:   PetscInt          mbs = a->mbs,i,j,n;
1974:   const PetscInt    *idx,*ii,*ridx=NULL;
1975:   PetscBool         usecprow=a->compressedrow.use;

1977:   VecGetArrayRead(xx,&x);
1978:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1980:   idx = a->j;
1981:   v   = a->a;
1982:   if (usecprow) {
1983:     if (zz != yy) {
1984:       PetscArraycpy(zarray,yarray,6*mbs);
1985:     }
1986:     mbs  = a->compressedrow.nrows;
1987:     ii   = a->compressedrow.i;
1988:     ridx = a->compressedrow.rindex;
1989:   } else {
1990:     ii = a->i;
1991:     y  = yarray;
1992:     z  = zarray;
1993:   }

1995:   for (i=0; i<mbs; i++) {
1996:     n = ii[1] - ii[0]; ii++;
1997:     if (usecprow) {
1998:       z = zarray + 6*ridx[i];
1999:       y = yarray + 6*ridx[i];
2000:     }
2001:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2002:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2003:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2004:     for (j=0; j<n; j++) {
2005:       xb    = x + 6*(*idx++);
2006:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2007:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2008:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2009:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2010:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2011:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2012:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2013:       v    += 36;
2014:     }
2015:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2016:     if (!usecprow) {
2017:       z += 6; y += 6;
2018:     }
2019:   }
2020:   VecRestoreArrayRead(xx,&x);
2021:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2022:   PetscLogFlops(72.0*a->nz);
2023:   return 0;
2024: }

2026: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2027: {
2028:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2029:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2030:   const PetscScalar *x,*xb;
2031:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2032:   const MatScalar   *v;
2033:   PetscInt          mbs = a->mbs,i,j,n;
2034:   const PetscInt    *idx,*ii,*ridx = NULL;
2035:   PetscBool         usecprow=a->compressedrow.use;

2037:   VecGetArrayRead(xx,&x);
2038:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2040:   idx = a->j;
2041:   v   = a->a;
2042:   if (usecprow) {
2043:     if (zz != yy) {
2044:       PetscArraycpy(zarray,yarray,7*mbs);
2045:     }
2046:     mbs  = a->compressedrow.nrows;
2047:     ii   = a->compressedrow.i;
2048:     ridx = a->compressedrow.rindex;
2049:   } else {
2050:     ii = a->i;
2051:     y  = yarray;
2052:     z  = zarray;
2053:   }

2055:   for (i=0; i<mbs; i++) {
2056:     n = ii[1] - ii[0]; ii++;
2057:     if (usecprow) {
2058:       z = zarray + 7*ridx[i];
2059:       y = yarray + 7*ridx[i];
2060:     }
2061:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2062:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2063:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2064:     for (j=0; j<n; j++) {
2065:       xb    = x + 7*(*idx++);
2066:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2067:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2068:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2069:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2070:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2071:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2072:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2073:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2074:       v    += 49;
2075:     }
2076:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2077:     if (!usecprow) {
2078:       z += 7; y += 7;
2079:     }
2080:   }
2081:   VecRestoreArrayRead(xx,&x);
2082:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2083:   PetscLogFlops(98.0*a->nz);
2084:   return 0;
2085: }

2087: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2088: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2089: {
2090:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2091:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2092:   const PetscScalar *x,*xb;
2093:   const MatScalar   *v;
2094:   PetscInt          mbs,i,j,n;
2095:   PetscInt          k;
2096:   PetscBool         usecprow=a->compressedrow.use;
2097:   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;

2099:   __m256d a0,a1,a2,a3,a4,a5;
2100:   __m256d w0,w1,w2,w3;
2101:   __m256d z0,z1,z2;
2102:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

2104:   VecCopy(yy,zz);
2105:   VecGetArrayRead(xx,&x);
2106:   VecGetArray(zz,&zarray);

2108:   idx = a->j;
2109:   v   = a->a;
2110:   if (usecprow) {
2111:     mbs  = a->compressedrow.nrows;
2112:     ii   = a->compressedrow.i;
2113:     ridx = a->compressedrow.rindex;
2114:   } else {
2115:     mbs = a->mbs;
2116:     ii  = a->i;
2117:     z   = zarray;
2118:   }

2120:   if (!a->mult_work) {
2121:     k    = PetscMax(A->rmap->n,A->cmap->n);
2122:     PetscMalloc1(k+1,&a->mult_work);
2123:   }

2125:   work = a->mult_work;
2126:   for (i=0; i<mbs; i++) {
2127:     n           = ii[1] - ii[0]; ii++;
2128:     workt       = work;
2129:     for (j=0; j<n; j++) {
2130:       xb = x + bs*(*idx++);
2131:       for (k=0; k<bs; k++) workt[k] = xb[k];
2132:       workt += bs;
2133:     }
2134:     if (usecprow) z = zarray + bs*ridx[i];

2136:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

2138:     for (j=0; j<n; j++) {
2139:       /* first column of a */
2140:       w0 = _mm256_set1_pd(work[j*9  ]);
2141:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2142:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2143:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

2145:       /* second column of a */
2146:       w1 = _mm256_set1_pd(work[j*9+ 1]);
2147:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2148:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2149:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2151:       /* third column of a */
2152:       w2 = _mm256_set1_pd(work[j*9+ 2]);
2153:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2154:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2155:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

2157:       /* fourth column of a */
2158:       w3 = _mm256_set1_pd(work[j*9+ 3]);
2159:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2160:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2161:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

2163:       /* fifth column of a */
2164:       w0 = _mm256_set1_pd(work[j*9+ 4]);
2165:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2166:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2167:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

2169:       /* sixth column of a */
2170:       w1 = _mm256_set1_pd(work[j*9+ 5]);
2171:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2172:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2173:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2175:       /* seventh column of a */
2176:       w2 = _mm256_set1_pd(work[j*9+ 6]);
2177:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2178:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2179:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

2181:       /* eighth column of a */
2182:       w3 = _mm256_set1_pd(work[j*9+ 7]);
2183:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2184:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2185:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

2187:       /* ninth column of a */
2188:       w0 = _mm256_set1_pd(work[j*9+ 8]);
2189:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2190:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2191:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2192:     }

2194:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

2196:     v += n*bs2;
2197:     if (!usecprow) z += bs;
2198:   }
2199:   VecRestoreArrayRead(xx,&x);
2200:   VecRestoreArray(zz,&zarray);
2201:   PetscLogFlops(162.0*a->nz);
2202:   return 0;
2203: }
2204: #endif

2206: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2207: {
2208:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2209:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2210:   const PetscScalar *x,*xb;
2211:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2212:   const MatScalar   *v;
2213:   PetscInt          mbs = a->mbs,i,j,n;
2214:   const PetscInt    *idx,*ii,*ridx = NULL;
2215:   PetscBool         usecprow=a->compressedrow.use;

2217:   VecGetArrayRead(xx,&x);
2218:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2220:   idx = a->j;
2221:   v   = a->a;
2222:   if (usecprow) {
2223:     if (zz != yy) {
2224:       PetscArraycpy(zarray,yarray,7*mbs);
2225:     }
2226:     mbs  = a->compressedrow.nrows;
2227:     ii   = a->compressedrow.i;
2228:     ridx = a->compressedrow.rindex;
2229:   } else {
2230:     ii = a->i;
2231:     y  = yarray;
2232:     z  = zarray;
2233:   }

2235:   for (i=0; i<mbs; i++) {
2236:     n = ii[1] - ii[0]; ii++;
2237:     if (usecprow) {
2238:       z = zarray + 11*ridx[i];
2239:       y = yarray + 11*ridx[i];
2240:     }
2241:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2242:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2243:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2244:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2245:     for (j=0; j<n; j++) {
2246:       xb    = x + 11*(*idx++);
2247:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2248:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2249:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2250:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2251:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2252:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2253:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2254:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2255:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2256:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2257:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2258:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2259:       v    += 121;
2260:     }
2261:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2262:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2263:     if (!usecprow) {
2264:       z += 11; y += 11;
2265:     }
2266:   }
2267:   VecRestoreArrayRead(xx,&x);
2268:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2269:   PetscLogFlops(242.0*a->nz);
2270:   return 0;
2271: }

2273: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2274: {
2275:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2276:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2277:   const PetscScalar *x,*xb;
2278:   const MatScalar   *v;
2279:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2280:   PetscInt          ncols,k;
2281:   const PetscInt    *ridx = NULL,*idx,*ii;
2282:   PetscBool         usecprow = a->compressedrow.use;

2284:   VecCopy(yy,zz);
2285:   VecGetArrayRead(xx,&x);
2286:   VecGetArray(zz,&zarray);

2288:   idx = a->j;
2289:   v   = a->a;
2290:   if (usecprow) {
2291:     mbs  = a->compressedrow.nrows;
2292:     ii   = a->compressedrow.i;
2293:     ridx = a->compressedrow.rindex;
2294:   } else {
2295:     mbs = a->mbs;
2296:     ii  = a->i;
2297:     z   = zarray;
2298:   }

2300:   if (!a->mult_work) {
2301:     k    = PetscMax(A->rmap->n,A->cmap->n);
2302:     PetscMalloc1(k+1,&a->mult_work);
2303:   }
2304:   work = a->mult_work;
2305:   for (i=0; i<mbs; i++) {
2306:     n     = ii[1] - ii[0]; ii++;
2307:     ncols = n*bs;
2308:     workt = work;
2309:     for (j=0; j<n; j++) {
2310:       xb = x + bs*(*idx++);
2311:       for (k=0; k<bs; k++) workt[k] = xb[k];
2312:       workt += bs;
2313:     }
2314:     if (usecprow) z = zarray + bs*ridx[i];
2315:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2316:     v += n*bs2;
2317:     if (!usecprow) z += bs;
2318:   }
2319:   VecRestoreArrayRead(xx,&x);
2320:   VecRestoreArray(zz,&zarray);
2321:   PetscLogFlops(2.0*a->nz*bs2);
2322:   return 0;
2323: }

2325: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2326: {
2327:   PetscScalar    zero = 0.0;

2329:   VecSet(zz,zero);
2330:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2331:   return 0;
2332: }

2334: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2335: {
2336:   PetscScalar    zero = 0.0;

2338:   VecSet(zz,zero);
2339:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2340:   return 0;
2341: }

2343: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2344: {
2345:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2346:   PetscScalar       *z,x1,x2,x3,x4,x5;
2347:   const PetscScalar *x,*xb = NULL;
2348:   const MatScalar   *v;
2349:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2350:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2351:   Mat_CompressedRow cprow = a->compressedrow;
2352:   PetscBool         usecprow = cprow.use;

2354:   if (yy != zz) VecCopy(yy,zz);
2355:   VecGetArrayRead(xx,&x);
2356:   VecGetArray(zz,&z);

2358:   idx = a->j;
2359:   v   = a->a;
2360:   if (usecprow) {
2361:     mbs  = cprow.nrows;
2362:     ii   = cprow.i;
2363:     ridx = cprow.rindex;
2364:   } else {
2365:     mbs=a->mbs;
2366:     ii = a->i;
2367:     xb = x;
2368:   }

2370:   switch (bs) {
2371:   case 1:
2372:     for (i=0; i<mbs; i++) {
2373:       if (usecprow) xb = x + ridx[i];
2374:       x1 = xb[0];
2375:       ib = idx + ii[0];
2376:       n  = ii[1] - ii[0]; ii++;
2377:       for (j=0; j<n; j++) {
2378:         rval     = ib[j];
2379:         z[rval] += PetscConj(*v) * x1;
2380:         v++;
2381:       }
2382:       if (!usecprow) xb++;
2383:     }
2384:     break;
2385:   case 2:
2386:     for (i=0; i<mbs; i++) {
2387:       if (usecprow) xb = x + 2*ridx[i];
2388:       x1 = xb[0]; x2 = xb[1];
2389:       ib = idx + ii[0];
2390:       n  = ii[1] - ii[0]; ii++;
2391:       for (j=0; j<n; j++) {
2392:         rval       = ib[j]*2;
2393:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2394:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2395:         v         += 4;
2396:       }
2397:       if (!usecprow) xb += 2;
2398:     }
2399:     break;
2400:   case 3:
2401:     for (i=0; i<mbs; i++) {
2402:       if (usecprow) xb = x + 3*ridx[i];
2403:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2404:       ib = idx + ii[0];
2405:       n  = ii[1] - ii[0]; ii++;
2406:       for (j=0; j<n; j++) {
2407:         rval       = ib[j]*3;
2408:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2409:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2410:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2411:         v         += 9;
2412:       }
2413:       if (!usecprow) xb += 3;
2414:     }
2415:     break;
2416:   case 4:
2417:     for (i=0; i<mbs; i++) {
2418:       if (usecprow) xb = x + 4*ridx[i];
2419:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2420:       ib = idx + ii[0];
2421:       n  = ii[1] - ii[0]; ii++;
2422:       for (j=0; j<n; j++) {
2423:         rval       = ib[j]*4;
2424:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2425:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2426:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2427:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2428:         v         += 16;
2429:       }
2430:       if (!usecprow) xb += 4;
2431:     }
2432:     break;
2433:   case 5:
2434:     for (i=0; i<mbs; i++) {
2435:       if (usecprow) xb = x + 5*ridx[i];
2436:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2437:       x4 = xb[3]; x5 = xb[4];
2438:       ib = idx + ii[0];
2439:       n  = ii[1] - ii[0]; ii++;
2440:       for (j=0; j<n; j++) {
2441:         rval       = ib[j]*5;
2442:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2443:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2444:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2445:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2446:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2447:         v         += 25;
2448:       }
2449:       if (!usecprow) xb += 5;
2450:     }
2451:     break;
2452:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2453:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2454: #if 0
2455:     {
2456:       PetscInt          ncols,k,bs2=a->bs2;
2457:       PetscScalar       *work,*workt,zb;
2458:       const PetscScalar *xtmp;
2459:       if (!a->mult_work) {
2460:         k    = PetscMax(A->rmap->n,A->cmap->n);
2461:         PetscMalloc1(k+1,&a->mult_work);
2462:       }
2463:       work = a->mult_work;
2464:       xtmp = x;
2465:       for (i=0; i<mbs; i++) {
2466:         n     = ii[1] - ii[0]; ii++;
2467:         ncols = n*bs;
2468:         PetscArrayzero(work,ncols);
2469:         if (usecprow) xtmp = x + bs*ridx[i];
2470:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2471:         v += n*bs2;
2472:         if (!usecprow) xtmp += bs;
2473:         workt = work;
2474:         for (j=0; j<n; j++) {
2475:           zb = z + bs*(*idx++);
2476:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2477:           workt += bs;
2478:         }
2479:       }
2480:     }
2481: #endif
2482:   }
2483:   VecRestoreArrayRead(xx,&x);
2484:   VecRestoreArray(zz,&z);
2485:   PetscLogFlops(2.0*a->nz*a->bs2);
2486:   return 0;
2487: }

2489: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2490: {
2491:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2492:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2493:   const PetscScalar *x,*xb = NULL;
2494:   const MatScalar   *v;
2495:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2496:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2497:   Mat_CompressedRow cprow   = a->compressedrow;
2498:   PetscBool         usecprow=cprow.use;

2500:   if (yy != zz) VecCopy(yy,zz);
2501:   VecGetArrayRead(xx,&x);
2502:   VecGetArray(zz,&z);

2504:   idx = a->j;
2505:   v   = a->a;
2506:   if (usecprow) {
2507:     mbs  = cprow.nrows;
2508:     ii   = cprow.i;
2509:     ridx = cprow.rindex;
2510:   } else {
2511:     mbs=a->mbs;
2512:     ii = a->i;
2513:     xb = x;
2514:   }

2516:   switch (bs) {
2517:   case 1:
2518:     for (i=0; i<mbs; i++) {
2519:       if (usecprow) xb = x + ridx[i];
2520:       x1 = xb[0];
2521:       ib = idx + ii[0];
2522:       n  = ii[1] - ii[0]; ii++;
2523:       for (j=0; j<n; j++) {
2524:         rval     = ib[j];
2525:         z[rval] += *v * x1;
2526:         v++;
2527:       }
2528:       if (!usecprow) xb++;
2529:     }
2530:     break;
2531:   case 2:
2532:     for (i=0; i<mbs; i++) {
2533:       if (usecprow) xb = x + 2*ridx[i];
2534:       x1 = xb[0]; x2 = xb[1];
2535:       ib = idx + ii[0];
2536:       n  = ii[1] - ii[0]; ii++;
2537:       for (j=0; j<n; j++) {
2538:         rval       = ib[j]*2;
2539:         z[rval++] += v[0]*x1 + v[1]*x2;
2540:         z[rval++] += v[2]*x1 + v[3]*x2;
2541:         v         += 4;
2542:       }
2543:       if (!usecprow) xb += 2;
2544:     }
2545:     break;
2546:   case 3:
2547:     for (i=0; i<mbs; i++) {
2548:       if (usecprow) xb = x + 3*ridx[i];
2549:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2550:       ib = idx + ii[0];
2551:       n  = ii[1] - ii[0]; ii++;
2552:       for (j=0; j<n; j++) {
2553:         rval       = ib[j]*3;
2554:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2555:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2556:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2557:         v         += 9;
2558:       }
2559:       if (!usecprow) xb += 3;
2560:     }
2561:     break;
2562:   case 4:
2563:     for (i=0; i<mbs; i++) {
2564:       if (usecprow) xb = x + 4*ridx[i];
2565:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2566:       ib = idx + ii[0];
2567:       n  = ii[1] - ii[0]; ii++;
2568:       for (j=0; j<n; j++) {
2569:         rval       = ib[j]*4;
2570:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2571:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2572:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2573:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2574:         v         += 16;
2575:       }
2576:       if (!usecprow) xb += 4;
2577:     }
2578:     break;
2579:   case 5:
2580:     for (i=0; i<mbs; i++) {
2581:       if (usecprow) xb = x + 5*ridx[i];
2582:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2583:       x4 = xb[3]; x5 = xb[4];
2584:       ib = idx + ii[0];
2585:       n  = ii[1] - ii[0]; ii++;
2586:       for (j=0; j<n; j++) {
2587:         rval       = ib[j]*5;
2588:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2589:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2590:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2591:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2592:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2593:         v         += 25;
2594:       }
2595:       if (!usecprow) xb += 5;
2596:     }
2597:     break;
2598:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2599:     PetscInt          ncols,k;
2600:     PetscScalar       *work,*workt;
2601:     const PetscScalar *xtmp;
2602:     if (!a->mult_work) {
2603:       k    = PetscMax(A->rmap->n,A->cmap->n);
2604:       PetscMalloc1(k+1,&a->mult_work);
2605:     }
2606:     work = a->mult_work;
2607:     xtmp = x;
2608:     for (i=0; i<mbs; i++) {
2609:       n     = ii[1] - ii[0]; ii++;
2610:       ncols = n*bs;
2611:       PetscArrayzero(work,ncols);
2612:       if (usecprow) xtmp = x + bs*ridx[i];
2613:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2614:       v += n*bs2;
2615:       if (!usecprow) xtmp += bs;
2616:       workt = work;
2617:       for (j=0; j<n; j++) {
2618:         zb = z + bs*(*idx++);
2619:         for (k=0; k<bs; k++) zb[k] += workt[k];
2620:         workt += bs;
2621:       }
2622:     }
2623:     }
2624:   }
2625:   VecRestoreArrayRead(xx,&x);
2626:   VecRestoreArray(zz,&z);
2627:   PetscLogFlops(2.0*a->nz*a->bs2);
2628:   return 0;
2629: }

2631: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2632: {
2633:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2634:   PetscInt       totalnz = a->bs2*a->nz;
2635:   PetscScalar    oalpha  = alpha;
2636:   PetscBLASInt   one = 1,tnz;

2638:   PetscBLASIntCast(totalnz,&tnz);
2639:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2640:   PetscLogFlops(totalnz);
2641:   return 0;
2642: }

2644: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2645: {
2646:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2647:   MatScalar      *v  = a->a;
2648:   PetscReal      sum = 0.0;
2649:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2651:   if (type == NORM_FROBENIUS) {
2652: #if defined(PETSC_USE_REAL___FP16)
2653:     PetscBLASInt one = 1,cnt = bs2*nz;
2654:     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2655: #else
2656:     for (i=0; i<bs2*nz; i++) {
2657:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2658:     }
2659: #endif
2660:     *norm = PetscSqrtReal(sum);
2661:     PetscLogFlops(2.0*bs2*nz);
2662:   } else if (type == NORM_1) { /* maximum column sum */
2663:     PetscReal *tmp;
2664:     PetscInt  *bcol = a->j;
2665:     PetscCalloc1(A->cmap->n+1,&tmp);
2666:     for (i=0; i<nz; i++) {
2667:       for (j=0; j<bs; j++) {
2668:         k1 = bs*(*bcol) + j; /* column index */
2669:         for (k=0; k<bs; k++) {
2670:           tmp[k1] += PetscAbsScalar(*v); v++;
2671:         }
2672:       }
2673:       bcol++;
2674:     }
2675:     *norm = 0.0;
2676:     for (j=0; j<A->cmap->n; j++) {
2677:       if (tmp[j] > *norm) *norm = tmp[j];
2678:     }
2679:     PetscFree(tmp);
2680:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2681:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2682:     *norm = 0.0;
2683:     for (k=0; k<bs; k++) {
2684:       for (j=0; j<a->mbs; j++) {
2685:         v   = a->a + bs2*a->i[j] + k;
2686:         sum = 0.0;
2687:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2688:           for (k1=0; k1<bs; k1++) {
2689:             sum += PetscAbsScalar(*v);
2690:             v   += bs;
2691:           }
2692:         }
2693:         if (sum > *norm) *norm = sum;
2694:       }
2695:     }
2696:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2697:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2698:   return 0;
2699: }

2701: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2702: {
2703:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2705:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2706:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2707:     *flg = PETSC_FALSE;
2708:     return 0;
2709:   }

2711:   /* if the a->i are the same */
2712:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2713:   if (!*flg) return 0;

2715:   /* if a->j are the same */
2716:   PetscArraycmp(a->j,b->j,a->nz,flg);
2717:   if (!*flg) return 0;

2719:   /* if a->a are the same */
2720:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2721:   return 0;

2723: }

2725: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2726: {
2727:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2728:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2729:   PetscScalar    *x,zero = 0.0;
2730:   MatScalar      *aa,*aa_j;

2733:   bs   = A->rmap->bs;
2734:   aa   = a->a;
2735:   ai   = a->i;
2736:   aj   = a->j;
2737:   ambs = a->mbs;
2738:   bs2  = a->bs2;

2740:   VecSet(v,zero);
2741:   VecGetArray(v,&x);
2742:   VecGetLocalSize(v,&n);
2744:   for (i=0; i<ambs; i++) {
2745:     for (j=ai[i]; j<ai[i+1]; j++) {
2746:       if (aj[j] == i) {
2747:         row  = i*bs;
2748:         aa_j = aa+j*bs2;
2749:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2750:         break;
2751:       }
2752:     }
2753:   }
2754:   VecRestoreArray(v,&x);
2755:   return 0;
2756: }

2758: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2759: {
2760:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2761:   const PetscScalar *l,*r,*li,*ri;
2762:   PetscScalar       x;
2763:   MatScalar         *aa, *v;
2764:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2765:   const PetscInt    *ai,*aj;

2767:   ai  = a->i;
2768:   aj  = a->j;
2769:   aa  = a->a;
2770:   m   = A->rmap->n;
2771:   n   = A->cmap->n;
2772:   bs  = A->rmap->bs;
2773:   mbs = a->mbs;
2774:   bs2 = a->bs2;
2775:   if (ll) {
2776:     VecGetArrayRead(ll,&l);
2777:     VecGetLocalSize(ll,&lm);
2779:     for (i=0; i<mbs; i++) { /* for each block row */
2780:       M  = ai[i+1] - ai[i];
2781:       li = l + i*bs;
2782:       v  = aa + bs2*ai[i];
2783:       for (j=0; j<M; j++) { /* for each block */
2784:         for (k=0; k<bs2; k++) {
2785:           (*v++) *= li[k%bs];
2786:         }
2787:       }
2788:     }
2789:     VecRestoreArrayRead(ll,&l);
2790:     PetscLogFlops(a->nz);
2791:   }

2793:   if (rr) {
2794:     VecGetArrayRead(rr,&r);
2795:     VecGetLocalSize(rr,&rn);
2797:     for (i=0; i<mbs; i++) { /* for each block row */
2798:       iai = ai[i];
2799:       M   = ai[i+1] - iai;
2800:       v   = aa + bs2*iai;
2801:       for (j=0; j<M; j++) { /* for each block */
2802:         ri = r + bs*aj[iai+j];
2803:         for (k=0; k<bs; k++) {
2804:           x = ri[k];
2805:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2806:           v += bs;
2807:         }
2808:       }
2809:     }
2810:     VecRestoreArrayRead(rr,&r);
2811:     PetscLogFlops(a->nz);
2812:   }
2813:   return 0;
2814: }

2816: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2817: {
2818:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2820:   info->block_size   = a->bs2;
2821:   info->nz_allocated = a->bs2*a->maxnz;
2822:   info->nz_used      = a->bs2*a->nz;
2823:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2824:   info->assemblies   = A->num_ass;
2825:   info->mallocs      = A->info.mallocs;
2826:   info->memory       = ((PetscObject)A)->mem;
2827:   if (A->factortype) {
2828:     info->fill_ratio_given  = A->info.fill_ratio_given;
2829:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2830:     info->factor_mallocs    = A->info.factor_mallocs;
2831:   } else {
2832:     info->fill_ratio_given  = 0;
2833:     info->fill_ratio_needed = 0;
2834:     info->factor_mallocs    = 0;
2835:   }
2836:   return 0;
2837: }

2839: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2840: {
2841:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2843:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2844:   return 0;
2845: }

2847: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2848: {
2849:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2850:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2851:   return 0;
2852: }

2854: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2855: {
2856:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2857:   PetscScalar       *z = NULL,sum1;
2858:   const PetscScalar *xb;
2859:   PetscScalar       x1;
2860:   const MatScalar   *v,*vv;
2861:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2862:   PetscBool         usecprow=a->compressedrow.use;

2864:   idx = a->j;
2865:   v   = a->a;
2866:   if (usecprow) {
2867:     mbs  = a->compressedrow.nrows;
2868:     ii   = a->compressedrow.i;
2869:     ridx = a->compressedrow.rindex;
2870:   } else {
2871:     mbs = a->mbs;
2872:     ii  = a->i;
2873:     z   = c;
2874:   }

2876:   for (i=0; i<mbs; i++) {
2877:     n           = ii[1] - ii[0]; ii++;
2878:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2879:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2880:     if (usecprow) z = c + ridx[i];
2881:     jj = idx;
2882:     vv = v;
2883:     for (k=0; k<cn; k++) {
2884:       idx = jj;
2885:       v = vv;
2886:       sum1    = 0.0;
2887:       for (j=0; j<n; j++) {
2888:         xb    = b + (*idx++); x1 = xb[0+k*bm];
2889:         sum1 += v[0]*x1;
2890:         v    += 1;
2891:       }
2892:       z[0+k*cm] = sum1;
2893:     }
2894:     if (!usecprow) z += 1;
2895:   }
2896:   return 0;
2897: }

2899: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2900: {
2901:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2902:   PetscScalar       *z = NULL,sum1,sum2;
2903:   const PetscScalar *xb;
2904:   PetscScalar       x1,x2;
2905:   const MatScalar   *v,*vv;
2906:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2907:   PetscBool         usecprow=a->compressedrow.use;

2909:   idx = a->j;
2910:   v   = a->a;
2911:   if (usecprow) {
2912:     mbs  = a->compressedrow.nrows;
2913:     ii   = a->compressedrow.i;
2914:     ridx = a->compressedrow.rindex;
2915:   } else {
2916:     mbs = a->mbs;
2917:     ii  = a->i;
2918:     z   = c;
2919:   }

2921:   for (i=0; i<mbs; i++) {
2922:     n           = ii[1] - ii[0]; ii++;
2923:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2924:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2925:     if (usecprow) z = c + 2*ridx[i];
2926:     jj = idx;
2927:     vv = v;
2928:     for (k=0; k<cn; k++) {
2929:       idx = jj;
2930:       v = vv;
2931:       sum1    = 0.0; sum2 = 0.0;
2932:       for (j=0; j<n; j++) {
2933:         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2934:         sum1 += v[0]*x1 + v[2]*x2;
2935:         sum2 += v[1]*x1 + v[3]*x2;
2936:         v    += 4;
2937:       }
2938:       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2939:     }
2940:     if (!usecprow) z += 2;
2941:   }
2942:   return 0;
2943: }

2945: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2946: {
2947:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2948:   PetscScalar       *z = NULL,sum1,sum2,sum3;
2949:   const PetscScalar *xb;
2950:   PetscScalar       x1,x2,x3;
2951:   const MatScalar   *v,*vv;
2952:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2953:   PetscBool         usecprow=a->compressedrow.use;

2955:   idx = a->j;
2956:   v   = a->a;
2957:   if (usecprow) {
2958:     mbs  = a->compressedrow.nrows;
2959:     ii   = a->compressedrow.i;
2960:     ridx = a->compressedrow.rindex;
2961:   } else {
2962:     mbs = a->mbs;
2963:     ii  = a->i;
2964:     z   = c;
2965:   }

2967:   for (i=0; i<mbs; i++) {
2968:     n           = ii[1] - ii[0]; ii++;
2969:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2970:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2971:     if (usecprow) z = c + 3*ridx[i];
2972:     jj = idx;
2973:     vv = v;
2974:     for (k=0; k<cn; k++) {
2975:       idx = jj;
2976:       v = vv;
2977:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
2978:       for (j=0; j<n; j++) {
2979:         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2980:         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2981:         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2982:         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2983:         v    += 9;
2984:       }
2985:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2986:     }
2987:     if (!usecprow) z += 3;
2988:   }
2989:   return 0;
2990: }

2992: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2993: {
2994:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2995:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
2996:   const PetscScalar *xb;
2997:   PetscScalar       x1,x2,x3,x4;
2998:   const MatScalar   *v,*vv;
2999:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3000:   PetscBool         usecprow=a->compressedrow.use;

3002:   idx = a->j;
3003:   v   = a->a;
3004:   if (usecprow) {
3005:     mbs  = a->compressedrow.nrows;
3006:     ii   = a->compressedrow.i;
3007:     ridx = a->compressedrow.rindex;
3008:   } else {
3009:     mbs = a->mbs;
3010:     ii  = a->i;
3011:     z   = c;
3012:   }

3014:   for (i=0; i<mbs; i++) {
3015:     n           = ii[1] - ii[0]; ii++;
3016:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3017:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3018:     if (usecprow) z = c + 4*ridx[i];
3019:     jj = idx;
3020:     vv = v;
3021:     for (k=0; k<cn; k++) {
3022:       idx = jj;
3023:       v = vv;
3024:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3025:       for (j=0; j<n; j++) {
3026:         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3027:         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3028:         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3029:         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3030:         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3031:         v    += 16;
3032:       }
3033:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3034:     }
3035:     if (!usecprow) z += 4;
3036:   }
3037:   return 0;
3038: }

3040: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3041: {
3042:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3043:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3044:   const PetscScalar *xb;
3045:   PetscScalar       x1,x2,x3,x4,x5;
3046:   const MatScalar   *v,*vv;
3047:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3048:   PetscBool         usecprow=a->compressedrow.use;

3050:   idx = a->j;
3051:   v   = a->a;
3052:   if (usecprow) {
3053:     mbs  = a->compressedrow.nrows;
3054:     ii   = a->compressedrow.i;
3055:     ridx = a->compressedrow.rindex;
3056:   } else {
3057:     mbs = a->mbs;
3058:     ii  = a->i;
3059:     z   = c;
3060:   }

3062:   for (i=0; i<mbs; i++) {
3063:     n           = ii[1] - ii[0]; ii++;
3064:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3065:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3066:     if (usecprow) z = c + 5*ridx[i];
3067:     jj = idx;
3068:     vv = v;
3069:     for (k=0; k<cn; k++) {
3070:       idx = jj;
3071:       v = vv;
3072:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3073:       for (j=0; j<n; j++) {
3074:         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3075:         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3076:         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3077:         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3078:         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3079:         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3080:         v    += 25;
3081:       }
3082:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3083:     }
3084:     if (!usecprow) z += 5;
3085:   }
3086:   return 0;
3087: }

3089: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3090: {
3091:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3092:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3093:   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3094:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3095:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3096:   PetscBLASInt      bbs,bcn,bbm,bcm;
3097:   PetscScalar       *z = NULL;
3098:   PetscScalar       *c,*b;
3099:   const MatScalar   *v;
3100:   const PetscInt    *idx,*ii,*ridx=NULL;
3101:   PetscScalar       _DZero=0.0,_DOne=1.0;
3102:   PetscBool         usecprow=a->compressedrow.use;

3104:   if (!cm || !cn) return 0;
3108:   b = bd->v;
3109:   if (a->nonzerorowcnt != A->rmap->n) {
3110:     MatZeroEntries(C);
3111:   }
3112:   MatDenseGetArray(C,&c);
3113:   switch (bs) {
3114:   case 1:
3115:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3116:     break;
3117:   case 2:
3118:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3119:     break;
3120:   case 3:
3121:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3122:     break;
3123:   case 4:
3124:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3125:     break;
3126:   case 5:
3127:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3128:     break;
3129:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3130:     PetscBLASIntCast(bs,&bbs);
3131:     PetscBLASIntCast(cn,&bcn);
3132:     PetscBLASIntCast(bm,&bbm);
3133:     PetscBLASIntCast(cm,&bcm);
3134:     idx = a->j;
3135:     v   = a->a;
3136:     if (usecprow) {
3137:       mbs  = a->compressedrow.nrows;
3138:       ii   = a->compressedrow.i;
3139:       ridx = a->compressedrow.rindex;
3140:     } else {
3141:       mbs = a->mbs;
3142:       ii  = a->i;
3143:       z   = c;
3144:     }
3145:     for (i=0; i<mbs; i++) {
3146:       n = ii[1] - ii[0]; ii++;
3147:       if (usecprow) z = c + bs*ridx[i];
3148:       if (n) {
3149:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3150:         v += bs2;
3151:       }
3152:       for (j=1; j<n; j++) {
3153:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3154:         v += bs2;
3155:       }
3156:       if (!usecprow) z += bs;
3157:     }
3158:   }
3159:   MatDenseRestoreArray(C,&c);
3160:   PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3161:   return 0;
3162: }