Actual source code: baij2.c

petsc-3.9.4 2018-09-11
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <petsc/private/kernels/blockinvert.h>
  4:  #include <petscbt.h>
  5:  #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8: #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 15:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 16:   const PetscInt *idx;
 17:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 18:   PetscBT        table;

 21:   m  = a->mbs;
 22:   ai = a->i;
 23:   aj = a->j;
 24:   bs = A->rmap->bs;

 26:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 28:   PetscBTCreate(m,&table);
 29:   PetscMalloc1(m+1,&nidx);
 30:   PetscMalloc1(A->rmap->N+1,&nidx2);

 32:   for (i=0; i<is_max; i++) {
 33:     /* Initialise the two local arrays */
 34:     isz  = 0;
 35:     PetscBTMemzero(m,table);

 37:     /* Extract the indices, assume there can be duplicate entries */
 38:     ISGetIndices(is[i],&idx);
 39:     ISGetLocalSize(is[i],&n);

 41:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 42:     for (j=0; j<n; ++j) {
 43:       ival = idx[j]/bs; /* convert the indices into block indices */
 44:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 45:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);

 50:     k = 0;
 51:     for (j=0; j<ov; j++) { /* for each overlap*/
 52:       n = isz;
 53:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 54:         row   = nidx[k];
 55:         start = ai[row];
 56:         end   = ai[row+1];
 57:         for (l = start; l<end; l++) {
 58:           val = aj[l];
 59:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 60:         }
 61:       }
 62:     }
 63:     /* expand the Index Set */
 64:     for (j=0; j<isz; j++) {
 65:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 66:     }
 67:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 68:   }
 69:   PetscBTDestroy(&table);
 70:   PetscFree(nidx);
 71:   PetscFree(nidx2);
 72:   return(0);
 73: }

 75: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 76: {
 77:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 79:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 80:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 81:   const PetscInt *irow,*icol;
 82:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscBool      flag;

 89:   ISGetIndices(isrow,&irow);
 90:   ISGetIndices(iscol,&icol);
 91:   ISGetLocalSize(isrow,&nrows);
 92:   ISGetLocalSize(iscol,&ncols);

 94:   PetscCalloc1(1+oldcols,&smap);
 95:   ssmap = smap;
 96:   PetscMalloc1(1+nrows,&lens);
 97:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 98:   /* determine lens of each row */
 99:   for (i=0; i<nrows; i++) {
100:     kstart  = ai[irow[i]];
101:     kend    = kstart + a->ilen[irow[i]];
102:     lens[i] = 0;
103:     for (k=kstart; k<kend; k++) {
104:       if (ssmap[aj[k]]) lens[i]++;
105:     }
106:   }
107:   /* Create and fill new matrix */
108:   if (scall == MAT_REUSE_MATRIX) {
109:     c = (Mat_SeqBAIJ*)((*B)->data);

111:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
113:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
115:     C    = *B;
116:   } else {
117:     MatCreate(PetscObjectComm((PetscObject)A),&C);
118:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119:     MatSetType(C,((PetscObject)A)->type_name);
120:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
121:   }
122:   c = (Mat_SeqBAIJ*)(C->data);
123:   for (i=0; i<nrows; i++) {
124:     row      = irow[i];
125:     kstart   = ai[row];
126:     kend     = kstart + a->ilen[row];
127:     mat_i    = c->i[i];
128:     mat_j    = c->j + mat_i;
129:     mat_a    = c->a + mat_i*bs2;
130:     mat_ilen = c->ilen + i;
131:     for (k=kstart; k<kend; k++) {
132:       if ((tcol=ssmap[a->j[k]])) {
133:         *mat_j++ = tcol - 1;
134:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135:         mat_a   += bs2;
136:         (*mat_ilen)++;
137:       }
138:     }
139:   }
140:   /* sort */
141:   {
142:     MatScalar *work;
143:     PetscMalloc1(bs2,&work);
144:     for (i=0; i<nrows; i++) {
145:       PetscInt ilen;
146:       mat_i = c->i[i];
147:       mat_j = c->j + mat_i;
148:       mat_a = c->a + mat_i*bs2;
149:       ilen  = c->ilen[i];
150:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151:     }
152:     PetscFree(work);
153:   }

155:   /* Free work space */
156:   ISRestoreIndices(iscol,&icol);
157:   PetscFree(smap);
158:   PetscFree(lens);
159:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

162:   ISRestoreIndices(isrow,&irow);
163:   *B   = C;
164:   return(0);
165: }

167: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168: {
169:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
170:   IS             is1,is2;
172:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173:   const PetscInt *irow,*icol;

176:   ISGetIndices(isrow,&irow);
177:   ISGetIndices(iscol,&icol);
178:   ISGetLocalSize(isrow,&nrows);
179:   ISGetLocalSize(iscol,&ncols);

181:   /* Verify if the indices corespond to each element in a block
182:    and form the IS with compressed IS */
183:   maxmnbs = PetscMax(a->mbs,a->nbs);
184:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
185:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
186:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187:   for (i=0; i<a->mbs; i++) {
188:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189:   }
190:   count = 0;
191:   for (i=0; i<nrows; i++) {
192:     j = irow[i] / bs;
193:     if ((vary[j]--)==bs) iary[count++] = j;
194:   }
195:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

197:   PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
198:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199:   for (i=0; i<a->nbs; i++) {
200:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201:   }
202:   count = 0;
203:   for (i=0; i<ncols; i++) {
204:     j = icol[i] / bs;
205:     if ((vary[j]--)==bs) iary[count++] = j;
206:   }
207:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
208:   ISRestoreIndices(isrow,&irow);
209:   ISRestoreIndices(iscol,&icol);
210:   PetscFree2(vary,iary);

212:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
213:   ISDestroy(&is1);
214:   ISDestroy(&is2);
215:   return(0);
216: }

218: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219: {
221:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
222:   Mat_SubSppt    *submatj = c->submatis1;

225:   submatj->destroy(C);
226:   MatDestroySubMatrix_Private(submatj);
227:   return(0);
228: }

230: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231: {
233:   PetscInt       i;
234:   Mat            C;
235:   Mat_SeqBAIJ    *c;
236:   Mat_SubSppt    *submatj;

239:   for (i=0; i<n; i++) {
240:     C       = (*mat)[i];
241:     c       = (Mat_SeqBAIJ*)C->data;
242:     submatj = c->submatis1;
243:     if (submatj) {
244:       submatj->destroy(C);
245:       MatDestroySubMatrix_Private(submatj);
246:       PetscLayoutDestroy(&C->rmap);
247:       PetscLayoutDestroy(&C->cmap);
248:       PetscHeaderDestroy(&C);
249:     } else {
250:       MatDestroy(&C);
251:     }
252:   }
253:   PetscFree(*mat);
254:   return(0);
255: }

257: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
258: {
260:   PetscInt       i;

263:   if (scall == MAT_INITIAL_MATRIX) {
264:     PetscCalloc1(n+1,B);
265:   }

267:   for (i=0; i<n; i++) {
268:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
269:   }
270:   return(0);
271: }

273: /* -------------------------------------------------------*/
274: /* Should check that shapes of vectors and matrices match */
275: /* -------------------------------------------------------*/

277: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
278: {
279:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
280:   PetscScalar       *z,sum;
281:   const PetscScalar *x;
282:   const MatScalar   *v;
283:   PetscErrorCode    ierr;
284:   PetscInt          mbs,i,n;
285:   const PetscInt    *idx,*ii,*ridx=NULL;
286:   PetscBool         usecprow=a->compressedrow.use;

289:   VecGetArrayRead(xx,&x);
290:   VecGetArray(zz,&z);

292:   if (usecprow) {
293:     mbs  = a->compressedrow.nrows;
294:     ii   = a->compressedrow.i;
295:     ridx = a->compressedrow.rindex;
296:     PetscMemzero(z,a->mbs*sizeof(PetscScalar));
297:   } else {
298:     mbs = a->mbs;
299:     ii  = a->i;
300:   }

302:   for (i=0; i<mbs; i++) {
303:     n   = ii[1] - ii[0];
304:     v   = a->a + ii[0];
305:     idx = a->j + ii[0];
306:     ii++;
307:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
308:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
309:     sum = 0.0;
310:     PetscSparseDensePlusDot(sum,x,v,idx,n);
311:     if (usecprow) {
312:       z[ridx[i]] = sum;
313:     } else {
314:       z[i]       = sum;
315:     }
316:   }
317:   VecRestoreArrayRead(xx,&x);
318:   VecRestoreArray(zz,&z);
319:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
320:   return(0);
321: }

323: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
324: {
325:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
326:   PetscScalar       *z = 0,sum1,sum2,*zarray;
327:   const PetscScalar *x,*xb;
328:   PetscScalar       x1,x2;
329:   const MatScalar   *v;
330:   PetscErrorCode    ierr;
331:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
332:   PetscBool         usecprow=a->compressedrow.use;

335:   VecGetArrayRead(xx,&x);
336:   VecGetArray(zz,&zarray);

338:   idx = a->j;
339:   v   = a->a;
340:   if (usecprow) {
341:     mbs  = a->compressedrow.nrows;
342:     ii   = a->compressedrow.i;
343:     ridx = a->compressedrow.rindex;
344:     PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
345:   } else {
346:     mbs = a->mbs;
347:     ii  = a->i;
348:     z   = zarray;
349:   }

351:   for (i=0; i<mbs; i++) {
352:     n           = ii[1] - ii[0]; ii++;
353:     sum1        = 0.0; sum2 = 0.0;
354:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
355:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356:     for (j=0; j<n; j++) {
357:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
358:       sum1 += v[0]*x1 + v[2]*x2;
359:       sum2 += v[1]*x1 + v[3]*x2;
360:       v    += 4;
361:     }
362:     if (usecprow) z = zarray + 2*ridx[i];
363:     z[0] = sum1; z[1] = sum2;
364:     if (!usecprow) z += 2;
365:   }
366:   VecRestoreArrayRead(xx,&x);
367:   VecRestoreArray(zz,&zarray);
368:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
369:   return(0);
370: }

372: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
373: {
374:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
375:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
376:   const PetscScalar *x,*xb;
377:   const MatScalar   *v;
378:   PetscErrorCode    ierr;
379:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
380:   PetscBool         usecprow=a->compressedrow.use;

382: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383: #pragma disjoint(*v,*z,*xb)
384: #endif

387:   VecGetArrayRead(xx,&x);
388:   VecGetArray(zz,&zarray);

390:   idx = a->j;
391:   v   = a->a;
392:   if (usecprow) {
393:     mbs  = a->compressedrow.nrows;
394:     ii   = a->compressedrow.i;
395:     ridx = a->compressedrow.rindex;
396:     PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
397:   } else {
398:     mbs = a->mbs;
399:     ii  = a->i;
400:     z   = zarray;
401:   }

403:   for (i=0; i<mbs; i++) {
404:     n           = ii[1] - ii[0]; ii++;
405:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
406:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
407:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408:     for (j=0; j<n; j++) {
409:       xb = x + 3*(*idx++);
410:       x1 = xb[0];
411:       x2 = xb[1];
412:       x3 = xb[2];

414:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
415:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
416:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
417:       v    += 9;
418:     }
419:     if (usecprow) z = zarray + 3*ridx[i];
420:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
421:     if (!usecprow) z += 3;
422:   }
423:   VecRestoreArrayRead(xx,&x);
424:   VecRestoreArray(zz,&zarray);
425:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
426:   return(0);
427: }

429: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
430: {
431:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
432:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
433:   const PetscScalar *x,*xb;
434:   const MatScalar   *v;
435:   PetscErrorCode    ierr;
436:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
437:   PetscBool         usecprow=a->compressedrow.use;

440:   VecGetArrayRead(xx,&x);
441:   VecGetArray(zz,&zarray);

443:   idx = a->j;
444:   v   = a->a;
445:   if (usecprow) {
446:     mbs  = a->compressedrow.nrows;
447:     ii   = a->compressedrow.i;
448:     ridx = a->compressedrow.rindex;
449:     PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
450:   } else {
451:     mbs = a->mbs;
452:     ii  = a->i;
453:     z   = zarray;
454:   }

456:   for (i=0; i<mbs; i++) {
457:     n = ii[1] - ii[0];
458:     ii++;
459:     sum1 = 0.0;
460:     sum2 = 0.0;
461:     sum3 = 0.0;
462:     sum4 = 0.0;

464:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
465:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
466:     for (j=0; j<n; j++) {
467:       xb    = x + 4*(*idx++);
468:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
469:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
470:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
471:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
472:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
473:       v    += 16;
474:     }
475:     if (usecprow) z = zarray + 4*ridx[i];
476:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
477:     if (!usecprow) z += 4;
478:   }
479:   VecRestoreArrayRead(xx,&x);
480:   VecRestoreArray(zz,&zarray);
481:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
482:   return(0);
483: }

485: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
486: {
487:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
488:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
489:   const PetscScalar *xb,*x;
490:   const MatScalar   *v;
491:   PetscErrorCode    ierr;
492:   const PetscInt    *idx,*ii,*ridx=NULL;
493:   PetscInt          mbs,i,j,n;
494:   PetscBool         usecprow=a->compressedrow.use;

497:   VecGetArrayRead(xx,&x);
498:   VecGetArray(zz,&zarray);

500:   idx = a->j;
501:   v   = a->a;
502:   if (usecprow) {
503:     mbs  = a->compressedrow.nrows;
504:     ii   = a->compressedrow.i;
505:     ridx = a->compressedrow.rindex;
506:     PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
507:   } else {
508:     mbs = a->mbs;
509:     ii  = a->i;
510:     z   = zarray;
511:   }

513:   for (i=0; i<mbs; i++) {
514:     n           = ii[1] - ii[0]; ii++;
515:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
516:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
517:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
518:     for (j=0; j<n; j++) {
519:       xb    = x + 5*(*idx++);
520:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
521:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
522:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
523:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
524:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
525:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
526:       v    += 25;
527:     }
528:     if (usecprow) z = zarray + 5*ridx[i];
529:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
530:     if (!usecprow) z += 5;
531:   }
532:   VecRestoreArrayRead(xx,&x);
533:   VecRestoreArray(zz,&zarray);
534:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
535:   return(0);
536: }



540: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541: {
542:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
543:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
544:   const PetscScalar *x,*xb;
545:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
546:   const MatScalar   *v;
547:   PetscErrorCode    ierr;
548:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
549:   PetscBool         usecprow=a->compressedrow.use;

552:   VecGetArrayRead(xx,&x);
553:   VecGetArray(zz,&zarray);

555:   idx = a->j;
556:   v   = a->a;
557:   if (usecprow) {
558:     mbs  = a->compressedrow.nrows;
559:     ii   = a->compressedrow.i;
560:     ridx = a->compressedrow.rindex;
561:     PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
562:   } else {
563:     mbs = a->mbs;
564:     ii  = a->i;
565:     z   = zarray;
566:   }

568:   for (i=0; i<mbs; i++) {
569:     n  = ii[1] - ii[0];
570:     ii++;
571:     sum1 = 0.0;
572:     sum2 = 0.0;
573:     sum3 = 0.0;
574:     sum4 = 0.0;
575:     sum5 = 0.0;
576:     sum6 = 0.0;

578:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
579:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580:     for (j=0; j<n; j++) {
581:       xb    = x + 6*(*idx++);
582:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
584:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
585:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
586:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
587:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
588:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
589:       v    += 36;
590:     }
591:     if (usecprow) z = zarray + 6*ridx[i];
592:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593:     if (!usecprow) z += 6;
594:   }

596:   VecRestoreArrayRead(xx,&x);
597:   VecRestoreArray(zz,&zarray);
598:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
599:   return(0);
600: }

602: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603: {
604:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606:   const PetscScalar *x,*xb;
607:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
608:   const MatScalar   *v;
609:   PetscErrorCode    ierr;
610:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
611:   PetscBool         usecprow=a->compressedrow.use;

614:   VecGetArrayRead(xx,&x);
615:   VecGetArray(zz,&zarray);

617:   idx = a->j;
618:   v   = a->a;
619:   if (usecprow) {
620:     mbs  = a->compressedrow.nrows;
621:     ii   = a->compressedrow.i;
622:     ridx = a->compressedrow.rindex;
623:     PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
624:   } else {
625:     mbs = a->mbs;
626:     ii  = a->i;
627:     z   = zarray;
628:   }

630:   for (i=0; i<mbs; i++) {
631:     n  = ii[1] - ii[0];
632:     ii++;
633:     sum1 = 0.0;
634:     sum2 = 0.0;
635:     sum3 = 0.0;
636:     sum4 = 0.0;
637:     sum5 = 0.0;
638:     sum6 = 0.0;
639:     sum7 = 0.0;

641:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
642:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643:     for (j=0; j<n; j++) {
644:       xb    = x + 7*(*idx++);
645:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653:       v    += 49;
654:     }
655:     if (usecprow) z = zarray + 7*ridx[i];
656:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657:     if (!usecprow) z += 7;
658:   }

660:   VecRestoreArrayRead(xx,&x);
661:   VecRestoreArray(zz,&zarray);
662:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
663:   return(0);
664: }

666: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668: {
669:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
670:   PetscScalar    *z = 0,*work,*workt,*zarray;
671:   const PetscScalar *x,*xb;
672:   const MatScalar   *v;
674:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675:   const PetscInt *idx,*ii,*ridx=NULL;
676:   PetscInt       k;
677:   PetscBool      usecprow=a->compressedrow.use;

679:   __m256d a0,a1,a2,a3,a4,a5;
680:   __m256d w0,w1,w2,w3;
681:   __m256d z0,z1,z2;
682:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

685:   VecGetArrayRead(xx,&x);
686:   VecGetArray(zz,&zarray);

688:   idx = a->j;
689:   v   = a->a;
690:   if (usecprow) {
691:     mbs  = a->compressedrow.nrows;
692:     ii   = a->compressedrow.i;
693:     ridx = a->compressedrow.rindex;
694:     PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
695:   } else {
696:     mbs = a->mbs;
697:     ii  = a->i;
698:     z   = zarray;
699:   }

701:   if (!a->mult_work) {
702:     k    = PetscMax(A->rmap->n,A->cmap->n);
703:     PetscMalloc1(k+1,&a->mult_work);
704:   }

706:   work = a->mult_work;
707:   for (i=0; i<mbs; i++) {
708:     n           = ii[1] - ii[0]; ii++;
709:     workt       = work;
710:     for (j=0; j<n; j++) {
711:       xb = x + bs*(*idx++);
712:       for (k=0; k<bs; k++) workt[k] = xb[k];
713:       workt += bs;
714:     }
715:     if (usecprow) z = zarray + bs*ridx[i];

717:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

719:     for (j=0; j<n; j++) {
720:       // first column of a
721:       w0 = _mm256_set1_pd(work[j*9  ]);
722:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

726:       // second column of a
727:       w1 = _mm256_set1_pd(work[j*9+ 1]);
728:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

732:       // third column of a
733:       w2 = _mm256_set1_pd(work[j*9 +2]);
734:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

738:       // fourth column of a
739:       w3 = _mm256_set1_pd(work[j*9+ 3]);
740:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

744:       // fifth column of a
745:       w0 = _mm256_set1_pd(work[j*9+ 4]);
746:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

750:       // sixth column of a
751:       w1 = _mm256_set1_pd(work[j*9+ 5]);
752:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

756:       // seventh column of a
757:       w2 = _mm256_set1_pd(work[j*9+ 6]);
758:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

762:       // eigth column of a
763:       w3 = _mm256_set1_pd(work[j*9+ 7]);
764:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

768:       // ninth column of a
769:       w0 = _mm256_set1_pd(work[j*9+ 8]);
770:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773:     }

775:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

777:     v += n*bs2;
778:     if (!usecprow) z += bs;
779:   }
780:   VecRestoreArrayRead(xx,&x);
781:   VecRestoreArray(zz,&zarray);
782:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
783:   return(0);
784: }
785: #endif

787: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788: {
789:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
790:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791:   const PetscScalar *x,*xb;
792:   PetscScalar       *zarray,xv;
793:   const MatScalar   *v;
794:   PetscErrorCode    ierr;
795:   const PetscInt    *ii,*ij=a->j,*idx;
796:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
797:   PetscBool         usecprow=a->compressedrow.use;

800:   VecGetArrayRead(xx,&x);
801:   VecGetArray(zz,&zarray);

803:   v = a->a;
804:   if (usecprow) {
805:     mbs  = a->compressedrow.nrows;
806:     ii   = a->compressedrow.i;
807:     ridx = a->compressedrow.rindex;
808:     PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));
809:   } else {
810:     mbs = a->mbs;
811:     ii  = a->i;
812:     z   = zarray;
813:   }

815:   for (i=0; i<mbs; i++) {
816:     n    = ii[i+1] - ii[i];
817:     idx  = ij + ii[i];
818:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

821:     for (j=0; j<n; j++) {
822:       xb = x + 11*(idx[j]);

824:       for (k=0; k<11; k++) {
825:         xv     =  xb[k];
826:         sum1  += v[0]*xv;
827:         sum2  += v[1]*xv;
828:         sum3  += v[2]*xv;
829:         sum4  += v[3]*xv;
830:         sum5  += v[4]*xv;
831:         sum6  += v[5]*xv;
832:         sum7  += v[6]*xv;
833:         sum8  += v[7]*xv;
834:         sum9  += v[8]*xv;
835:         sum10 += v[9]*xv;
836:         sum11 += v[10]*xv;
837:         v     += 11;
838:       }
839:     }
840:     if (usecprow) z = zarray + 11*ridx[i];
841:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

844:     if (!usecprow) z += 11;
845:   }

847:   VecRestoreArrayRead(xx,&x);
848:   VecRestoreArray(zz,&zarray);
849:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
850:   return(0);
851: }

853: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854: /* Default MatMult for block size 15 */

856: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
857: {
858:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
859:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
860:   const PetscScalar *x,*xb;
861:   PetscScalar       *zarray,xv;
862:   const MatScalar   *v;
863:   PetscErrorCode    ierr;
864:   const PetscInt    *ii,*ij=a->j,*idx;
865:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
866:   PetscBool         usecprow=a->compressedrow.use;

869:   VecGetArrayRead(xx,&x);
870:   VecGetArray(zz,&zarray);

872:   v = a->a;
873:   if (usecprow) {
874:     mbs  = a->compressedrow.nrows;
875:     ii   = a->compressedrow.i;
876:     ridx = a->compressedrow.rindex;
877:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
878:   } else {
879:     mbs = a->mbs;
880:     ii  = a->i;
881:     z   = zarray;
882:   }

884:   for (i=0; i<mbs; i++) {
885:     n    = ii[i+1] - ii[i];
886:     idx  = ij + ii[i];
887:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
888:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

890:     for (j=0; j<n; j++) {
891:       xb = x + 15*(idx[j]);

893:       for (k=0; k<15; k++) {
894:         xv     =  xb[k];
895:         sum1  += v[0]*xv;
896:         sum2  += v[1]*xv;
897:         sum3  += v[2]*xv;
898:         sum4  += v[3]*xv;
899:         sum5  += v[4]*xv;
900:         sum6  += v[5]*xv;
901:         sum7  += v[6]*xv;
902:         sum8  += v[7]*xv;
903:         sum9  += v[8]*xv;
904:         sum10 += v[9]*xv;
905:         sum11 += v[10]*xv;
906:         sum12 += v[11]*xv;
907:         sum13 += v[12]*xv;
908:         sum14 += v[13]*xv;
909:         sum15 += v[14]*xv;
910:         v     += 15;
911:       }
912:     }
913:     if (usecprow) z = zarray + 15*ridx[i];
914:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

917:     if (!usecprow) z += 15;
918:   }

920:   VecRestoreArrayRead(xx,&x);
921:   VecRestoreArray(zz,&zarray);
922:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
923:   return(0);
924: }

926: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
927: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
928: {
929:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
930:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931:   const PetscScalar *x,*xb;
932:   PetscScalar       x1,x2,x3,x4,*zarray;
933:   const MatScalar   *v;
934:   PetscErrorCode    ierr;
935:   const PetscInt    *ii,*ij=a->j,*idx;
936:   PetscInt          mbs,i,j,n,*ridx=NULL;
937:   PetscBool         usecprow=a->compressedrow.use;

940:   VecGetArrayRead(xx,&x);
941:   VecGetArray(zz,&zarray);

943:   v = a->a;
944:   if (usecprow) {
945:     mbs  = a->compressedrow.nrows;
946:     ii   = a->compressedrow.i;
947:     ridx = a->compressedrow.rindex;
948:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
949:   } else {
950:     mbs = a->mbs;
951:     ii  = a->i;
952:     z   = zarray;
953:   }

955:   for (i=0; i<mbs; i++) {
956:     n    = ii[i+1] - ii[i];
957:     idx  = ij + ii[i];
958:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

961:     for (j=0; j<n; j++) {
962:       xb = x + 15*(idx[j]);
963:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

965:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
966:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
967:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
968:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
969:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
970:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
971:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
972:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
973:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
974:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
975:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
976:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
977:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
978:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
979:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

981:       v += 60;

983:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

985:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
986:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
987:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
988:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
989:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
990:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
991:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
992:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
993:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
994:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
995:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
996:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
997:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
998:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
999:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1000:       v     += 60;

1002:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1003:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1004:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1005:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1006:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1007:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1008:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1009:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1010:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1011:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1012:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1013:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1014:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1015:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1016:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1017:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1018:       v     += 60;

1020:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1021:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1022:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1023:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1024:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1025:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1026:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1027:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1028:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1029:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1030:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1031:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1032:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1033:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1034:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1035:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1036:       v     += 45;
1037:     }
1038:     if (usecprow) z = zarray + 15*ridx[i];
1039:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1040:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1042:     if (!usecprow) z += 15;
1043:   }

1045:   VecRestoreArrayRead(xx,&x);
1046:   VecRestoreArray(zz,&zarray);
1047:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1048:   return(0);
1049: }

1051: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1052: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1053: {
1054:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1055:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1056:   const PetscScalar *x,*xb;
1057:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1058:   const MatScalar   *v;
1059:   PetscErrorCode    ierr;
1060:   const PetscInt    *ii,*ij=a->j,*idx;
1061:   PetscInt          mbs,i,j,n,*ridx=NULL;
1062:   PetscBool         usecprow=a->compressedrow.use;

1065:   VecGetArrayRead(xx,&x);
1066:   VecGetArray(zz,&zarray);

1068:   v = a->a;
1069:   if (usecprow) {
1070:     mbs  = a->compressedrow.nrows;
1071:     ii   = a->compressedrow.i;
1072:     ridx = a->compressedrow.rindex;
1073:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1074:   } else {
1075:     mbs = a->mbs;
1076:     ii  = a->i;
1077:     z   = zarray;
1078:   }

1080:   for (i=0; i<mbs; i++) {
1081:     n    = ii[i+1] - ii[i];
1082:     idx  = ij + ii[i];
1083:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1084:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1086:     for (j=0; j<n; j++) {
1087:       xb = x + 15*(idx[j]);
1088:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1089:       x8 = xb[7];

1091:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1092:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1093:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1094:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1095:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1096:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1097:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1098:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1099:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1100:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1101:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1102:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1103:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1104:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1105:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1106:       v     += 120;

1108:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1110:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1111:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1112:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1113:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1114:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1115:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1116:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1117:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1118:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1119:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1120:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1121:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1122:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1123:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1124:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1125:       v     += 105;
1126:     }
1127:     if (usecprow) z = zarray + 15*ridx[i];
1128:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1129:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1131:     if (!usecprow) z += 15;
1132:   }

1134:   VecRestoreArrayRead(xx,&x);
1135:   VecRestoreArray(zz,&zarray);
1136:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1137:   return(0);
1138: }

1140: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

1142: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1143: {
1144:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1145:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1146:   const PetscScalar *x,*xb;
1147:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1148:   const MatScalar   *v;
1149:   PetscErrorCode    ierr;
1150:   const PetscInt    *ii,*ij=a->j,*idx;
1151:   PetscInt          mbs,i,j,n,*ridx=NULL;
1152:   PetscBool         usecprow=a->compressedrow.use;

1155:   VecGetArrayRead(xx,&x);
1156:   VecGetArray(zz,&zarray);

1158:   v = a->a;
1159:   if (usecprow) {
1160:     mbs  = a->compressedrow.nrows;
1161:     ii   = a->compressedrow.i;
1162:     ridx = a->compressedrow.rindex;
1163:     PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1164:   } else {
1165:     mbs = a->mbs;
1166:     ii  = a->i;
1167:     z   = zarray;
1168:   }

1170:   for (i=0; i<mbs; i++) {
1171:     n    = ii[i+1] - ii[i];
1172:     idx  = ij + ii[i];
1173:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1174:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1176:     for (j=0; j<n; j++) {
1177:       xb = x + 15*(idx[j]);
1178:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1179:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1181:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1182:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1183:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1184:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1185:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1186:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1187:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1188:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1189:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1190:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1191:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1192:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1193:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1194:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1195:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1196:       v     += 225;
1197:     }
1198:     if (usecprow) z = zarray + 15*ridx[i];
1199:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1200:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1202:     if (!usecprow) z += 15;
1203:   }

1205:   VecRestoreArrayRead(xx,&x);
1206:   VecRestoreArray(zz,&zarray);
1207:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1208:   return(0);
1209: }


1212: /*
1213:     This will not work with MatScalar == float because it calls the BLAS
1214: */
1215: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1216: {
1217:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1218:   PetscScalar       *z = 0,*work,*workt,*zarray;
1219:   const PetscScalar *x,*xb;
1220:   const MatScalar   *v;
1221:   PetscErrorCode    ierr;
1222:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1223:   const PetscInt    *idx,*ii,*ridx=NULL;
1224:   PetscInt          ncols,k;
1225:   PetscBool         usecprow=a->compressedrow.use;

1228:   VecGetArrayRead(xx,&x);
1229:   VecGetArray(zz,&zarray);

1231:   idx = a->j;
1232:   v   = a->a;
1233:   if (usecprow) {
1234:     mbs  = a->compressedrow.nrows;
1235:     ii   = a->compressedrow.i;
1236:     ridx = a->compressedrow.rindex;
1237:     PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1238:   } else {
1239:     mbs = a->mbs;
1240:     ii  = a->i;
1241:     z   = zarray;
1242:   }

1244:   if (!a->mult_work) {
1245:     k    = PetscMax(A->rmap->n,A->cmap->n);
1246:     PetscMalloc1(k+1,&a->mult_work);
1247:   }
1248:   work = a->mult_work;
1249:   for (i=0; i<mbs; i++) {
1250:     n           = ii[1] - ii[0]; ii++;
1251:     ncols       = n*bs;
1252:     workt       = work;
1253:     for (j=0; j<n; j++) {
1254:       xb = x + bs*(*idx++);
1255:       for (k=0; k<bs; k++) workt[k] = xb[k];
1256:       workt += bs;
1257:     }
1258:     if (usecprow) z = zarray + bs*ridx[i];
1259:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1260:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1261:     v += n*bs2;
1262:     if (!usecprow) z += bs;
1263:   }
1264:   VecRestoreArrayRead(xx,&x);
1265:   VecRestoreArray(zz,&zarray);
1266:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1267:   return(0);
1268: }

1270: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1271: {
1272:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1273:   const PetscScalar *x;
1274:   PetscScalar       *y,*z,sum;
1275:   const MatScalar   *v;
1276:   PetscErrorCode    ierr;
1277:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1278:   const PetscInt    *idx,*ii;
1279:   PetscBool         usecprow=a->compressedrow.use;

1282:   VecGetArrayRead(xx,&x);
1283:   VecGetArrayPair(yy,zz,&y,&z);

1285:   idx = a->j;
1286:   v   = a->a;
1287:   if (usecprow) {
1288:     if (zz != yy) {
1289:       PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1290:     }
1291:     mbs  = a->compressedrow.nrows;
1292:     ii   = a->compressedrow.i;
1293:     ridx = a->compressedrow.rindex;
1294:   } else {
1295:     ii = a->i;
1296:   }

1298:   for (i=0; i<mbs; i++) {
1299:     n = ii[1] - ii[0];
1300:     ii++;
1301:     if (!usecprow) {
1302:       sum         = y[i];
1303:     } else {
1304:       sum = y[ridx[i]];
1305:     }
1306:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1307:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1308:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1309:     v   += n;
1310:     idx += n;
1311:     if (usecprow) {
1312:       z[ridx[i]] = sum;
1313:     } else {
1314:       z[i] = sum;
1315:     }
1316:   }
1317:   VecRestoreArrayRead(xx,&x);
1318:   VecRestoreArrayPair(yy,zz,&y,&z);
1319:   PetscLogFlops(2.0*a->nz);
1320:   return(0);
1321: }

1323: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1324: {
1325:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1326:   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1327:   const PetscScalar *x,*xb;
1328:   PetscScalar       x1,x2,*yarray,*zarray;
1329:   const MatScalar   *v;
1330:   PetscErrorCode    ierr;
1331:   PetscInt          mbs = a->mbs,i,n,j;
1332:   const PetscInt    *idx,*ii,*ridx = NULL;
1333:   PetscBool         usecprow = a->compressedrow.use;

1336:   VecGetArrayRead(xx,&x);
1337:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1339:   idx = a->j;
1340:   v   = a->a;
1341:   if (usecprow) {
1342:     if (zz != yy) {
1343:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1344:     }
1345:     mbs  = a->compressedrow.nrows;
1346:     ii   = a->compressedrow.i;
1347:     ridx = a->compressedrow.rindex;
1348:   } else {
1349:     ii = a->i;
1350:     y  = yarray;
1351:     z  = zarray;
1352:   }

1354:   for (i=0; i<mbs; i++) {
1355:     n = ii[1] - ii[0]; ii++;
1356:     if (usecprow) {
1357:       z = zarray + 2*ridx[i];
1358:       y = yarray + 2*ridx[i];
1359:     }
1360:     sum1 = y[0]; sum2 = y[1];
1361:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1362:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363:     for (j=0; j<n; j++) {
1364:       xb = x + 2*(*idx++);
1365:       x1 = xb[0];
1366:       x2 = xb[1];

1368:       sum1 += v[0]*x1 + v[2]*x2;
1369:       sum2 += v[1]*x1 + v[3]*x2;
1370:       v    += 4;
1371:     }
1372:     z[0] = sum1; z[1] = sum2;
1373:     if (!usecprow) {
1374:       z += 2; y += 2;
1375:     }
1376:   }
1377:   VecRestoreArrayRead(xx,&x);
1378:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1379:   PetscLogFlops(4.0*a->nz);
1380:   return(0);
1381: }

1383: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1384: {
1385:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1386:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1387:   const PetscScalar *x,*xb;
1388:   const MatScalar   *v;
1389:   PetscErrorCode    ierr;
1390:   PetscInt          mbs = a->mbs,i,j,n;
1391:   const PetscInt    *idx,*ii,*ridx = NULL;
1392:   PetscBool         usecprow = a->compressedrow.use;

1395:   VecGetArrayRead(xx,&x);
1396:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1398:   idx = a->j;
1399:   v   = a->a;
1400:   if (usecprow) {
1401:     if (zz != yy) {
1402:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1403:     }
1404:     mbs  = a->compressedrow.nrows;
1405:     ii   = a->compressedrow.i;
1406:     ridx = a->compressedrow.rindex;
1407:   } else {
1408:     ii = a->i;
1409:     y  = yarray;
1410:     z  = zarray;
1411:   }

1413:   for (i=0; i<mbs; i++) {
1414:     n = ii[1] - ii[0]; ii++;
1415:     if (usecprow) {
1416:       z = zarray + 3*ridx[i];
1417:       y = yarray + 3*ridx[i];
1418:     }
1419:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1420:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1421:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1422:     for (j=0; j<n; j++) {
1423:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1424:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1425:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1426:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1427:       v    += 9;
1428:     }
1429:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1430:     if (!usecprow) {
1431:       z += 3; y += 3;
1432:     }
1433:   }
1434:   VecRestoreArrayRead(xx,&x);
1435:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1436:   PetscLogFlops(18.0*a->nz);
1437:   return(0);
1438: }

1440: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1441: {
1442:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1443:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1444:   const PetscScalar *x,*xb;
1445:   const MatScalar   *v;
1446:   PetscErrorCode    ierr;
1447:   PetscInt          mbs = a->mbs,i,j,n;
1448:   const PetscInt    *idx,*ii,*ridx=NULL;
1449:   PetscBool         usecprow=a->compressedrow.use;

1452:   VecGetArrayRead(xx,&x);
1453:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1455:   idx = a->j;
1456:   v   = a->a;
1457:   if (usecprow) {
1458:     if (zz != yy) {
1459:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1460:     }
1461:     mbs  = a->compressedrow.nrows;
1462:     ii   = a->compressedrow.i;
1463:     ridx = a->compressedrow.rindex;
1464:   } else {
1465:     ii = a->i;
1466:     y  = yarray;
1467:     z  = zarray;
1468:   }

1470:   for (i=0; i<mbs; i++) {
1471:     n = ii[1] - ii[0]; ii++;
1472:     if (usecprow) {
1473:       z = zarray + 4*ridx[i];
1474:       y = yarray + 4*ridx[i];
1475:     }
1476:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1477:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1478:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1479:     for (j=0; j<n; j++) {
1480:       xb    = x + 4*(*idx++);
1481:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1482:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1483:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1484:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1485:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1486:       v    += 16;
1487:     }
1488:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1489:     if (!usecprow) {
1490:       z += 4; y += 4;
1491:     }
1492:   }
1493:   VecRestoreArrayRead(xx,&x);
1494:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1495:   PetscLogFlops(32.0*a->nz);
1496:   return(0);
1497: }

1499: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1500: {
1501:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1502:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1503:   const PetscScalar *x,*xb;
1504:   PetscScalar       *yarray,*zarray;
1505:   const MatScalar   *v;
1506:   PetscErrorCode    ierr;
1507:   PetscInt          mbs = a->mbs,i,j,n;
1508:   const PetscInt    *idx,*ii,*ridx = NULL;
1509:   PetscBool         usecprow=a->compressedrow.use;

1512:   VecGetArrayRead(xx,&x);
1513:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1515:   idx = a->j;
1516:   v   = a->a;
1517:   if (usecprow) {
1518:     if (zz != yy) {
1519:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1520:     }
1521:     mbs  = a->compressedrow.nrows;
1522:     ii   = a->compressedrow.i;
1523:     ridx = a->compressedrow.rindex;
1524:   } else {
1525:     ii = a->i;
1526:     y  = yarray;
1527:     z  = zarray;
1528:   }

1530:   for (i=0; i<mbs; i++) {
1531:     n = ii[1] - ii[0]; ii++;
1532:     if (usecprow) {
1533:       z = zarray + 5*ridx[i];
1534:       y = yarray + 5*ridx[i];
1535:     }
1536:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1537:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1538:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1539:     for (j=0; j<n; j++) {
1540:       xb    = x + 5*(*idx++);
1541:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1542:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1543:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1544:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1545:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1546:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1547:       v    += 25;
1548:     }
1549:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1550:     if (!usecprow) {
1551:       z += 5; y += 5;
1552:     }
1553:   }
1554:   VecRestoreArrayRead(xx,&x);
1555:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1556:   PetscLogFlops(50.0*a->nz);
1557:   return(0);
1558: }
1559: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1560: {
1561:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1562:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1563:   const PetscScalar *x,*xb;
1564:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1565:   const MatScalar   *v;
1566:   PetscErrorCode    ierr;
1567:   PetscInt          mbs = a->mbs,i,j,n;
1568:   const PetscInt    *idx,*ii,*ridx=NULL;
1569:   PetscBool         usecprow=a->compressedrow.use;

1572:   VecGetArrayRead(xx,&x);
1573:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1575:   idx = a->j;
1576:   v   = a->a;
1577:   if (usecprow) {
1578:     if (zz != yy) {
1579:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1580:     }
1581:     mbs  = a->compressedrow.nrows;
1582:     ii   = a->compressedrow.i;
1583:     ridx = a->compressedrow.rindex;
1584:   } else {
1585:     ii = a->i;
1586:     y  = yarray;
1587:     z  = zarray;
1588:   }

1590:   for (i=0; i<mbs; i++) {
1591:     n = ii[1] - ii[0]; ii++;
1592:     if (usecprow) {
1593:       z = zarray + 6*ridx[i];
1594:       y = yarray + 6*ridx[i];
1595:     }
1596:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1597:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1598:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1599:     for (j=0; j<n; j++) {
1600:       xb    = x + 6*(*idx++);
1601:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1602:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1603:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1604:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1605:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1606:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1607:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1608:       v    += 36;
1609:     }
1610:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1611:     if (!usecprow) {
1612:       z += 6; y += 6;
1613:     }
1614:   }
1615:   VecRestoreArrayRead(xx,&x);
1616:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1617:   PetscLogFlops(72.0*a->nz);
1618:   return(0);
1619: }

1621: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1622: {
1623:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1624:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1625:   const PetscScalar *x,*xb;
1626:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1627:   const MatScalar   *v;
1628:   PetscErrorCode    ierr;
1629:   PetscInt          mbs = a->mbs,i,j,n;
1630:   const PetscInt    *idx,*ii,*ridx = NULL;
1631:   PetscBool         usecprow=a->compressedrow.use;

1634:   VecGetArrayRead(xx,&x);
1635:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1637:   idx = a->j;
1638:   v   = a->a;
1639:   if (usecprow) {
1640:     if (zz != yy) {
1641:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1642:     }
1643:     mbs  = a->compressedrow.nrows;
1644:     ii   = a->compressedrow.i;
1645:     ridx = a->compressedrow.rindex;
1646:   } else {
1647:     ii = a->i;
1648:     y  = yarray;
1649:     z  = zarray;
1650:   }

1652:   for (i=0; i<mbs; i++) {
1653:     n = ii[1] - ii[0]; ii++;
1654:     if (usecprow) {
1655:       z = zarray + 7*ridx[i];
1656:       y = yarray + 7*ridx[i];
1657:     }
1658:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1659:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1660:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1661:     for (j=0; j<n; j++) {
1662:       xb    = x + 7*(*idx++);
1663:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1664:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1665:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1666:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1667:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1668:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1669:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1670:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1671:       v    += 49;
1672:     }
1673:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1674:     if (!usecprow) {
1675:       z += 7; y += 7;
1676:     }
1677:   }
1678:   VecRestoreArrayRead(xx,&x);
1679:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1680:   PetscLogFlops(98.0*a->nz);
1681:   return(0);
1682: }

1684: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1685: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1686: {
1687:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1688:   PetscScalar    *z = 0,*work,*workt,*zarray;
1689:   const PetscScalar *x,*xb;
1690:   const MatScalar   *v;
1692:   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1693:   PetscInt       k;
1694:   PetscBool      usecprow=a->compressedrow.use;
1695:   const PetscInt *idx,*ii,*ridx=NULL;

1697:   __m256d a0,a1,a2,a3,a4,a5;
1698:   __m256d w0,w1,w2,w3;
1699:   __m256d z0,z1,z2;
1700:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

1703:   VecCopy(yy,zz);
1704:   VecGetArrayRead(xx,&x);
1705:   VecGetArray(zz,&zarray);

1707:   idx = a->j;
1708:   v   = a->a;
1709:   if (usecprow) {
1710:     mbs  = a->compressedrow.nrows;
1711:     ii   = a->compressedrow.i;
1712:     ridx = a->compressedrow.rindex;
1713:   } else {
1714:     mbs = a->mbs;
1715:     ii  = a->i;
1716:     z   = zarray;
1717:   }

1719:   if (!a->mult_work) {
1720:     k    = PetscMax(A->rmap->n,A->cmap->n);
1721:     PetscMalloc1(k+1,&a->mult_work);
1722:   }

1724:   work = a->mult_work;
1725:   for (i=0; i<mbs; i++) {
1726:     n           = ii[1] - ii[0]; ii++;
1727:     workt       = work;
1728:     for (j=0; j<n; j++) {
1729:       xb = x + bs*(*idx++);
1730:       for (k=0; k<bs; k++) workt[k] = xb[k];
1731:       workt += bs;
1732:     }
1733:     if (usecprow) z = zarray + bs*ridx[i];

1735:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

1737:     for (j=0; j<n; j++) {
1738:       // first column of a
1739:       w0 = _mm256_set1_pd(work[j*9  ]);
1740:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1741:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1742:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

1744:       // second column of a
1745:       w1 = _mm256_set1_pd(work[j*9+ 1]);
1746:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1747:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1748:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1750:       // third column of a
1751:       w2 = _mm256_set1_pd(work[j*9+ 2]);
1752:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1753:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1754:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

1756:       // fourth column of a
1757:       w3 = _mm256_set1_pd(work[j*9+ 3]);
1758:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1759:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1760:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

1762:       // fifth column of a
1763:       w0 = _mm256_set1_pd(work[j*9+ 4]);
1764:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1765:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1766:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

1768:       // sixth column of a
1769:       w1 = _mm256_set1_pd(work[j*9+ 5]);
1770:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1771:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1772:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

1774:       // seventh column of a
1775:       w2 = _mm256_set1_pd(work[j*9+ 6]);
1776:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1777:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1778:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

1780:       // eigth column of a
1781:       w3 = _mm256_set1_pd(work[j*9+ 7]);
1782:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1783:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1784:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

1786:       // ninth column of a
1787:       w0 = _mm256_set1_pd(work[j*9+ 8]);
1788:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1789:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1790:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1791:     }

1793:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

1795:     v += n*bs2;
1796:     if (!usecprow) z += bs;
1797:   }
1798:   VecRestoreArrayRead(xx,&x);
1799:   VecRestoreArray(zz,&zarray);
1800:   PetscLogFlops(162.0*a->nz);
1801:   return(0);
1802: }
1803: #endif

1805: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1806: {
1807:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1808:   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1809:   const PetscScalar *x,*xb;
1810:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1811:   const MatScalar   *v;
1812:   PetscErrorCode    ierr;
1813:   PetscInt          mbs = a->mbs,i,j,n;
1814:   const PetscInt    *idx,*ii,*ridx = NULL;
1815:   PetscBool         usecprow=a->compressedrow.use;

1818:   VecGetArrayRead(xx,&x);
1819:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1821:   idx = a->j;
1822:   v   = a->a;
1823:   if (usecprow) {
1824:     if (zz != yy) {
1825:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1826:     }
1827:     mbs  = a->compressedrow.nrows;
1828:     ii   = a->compressedrow.i;
1829:     ridx = a->compressedrow.rindex;
1830:   } else {
1831:     ii = a->i;
1832:     y  = yarray;
1833:     z  = zarray;
1834:   }

1836:   for (i=0; i<mbs; i++) {
1837:     n = ii[1] - ii[0]; ii++;
1838:     if (usecprow) {
1839:       z = zarray + 11*ridx[i];
1840:       y = yarray + 11*ridx[i];
1841:     }
1842:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1843:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1844:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1845:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1846:     for (j=0; j<n; j++) {
1847:       xb    = x + 11*(*idx++);
1848:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1849:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1850:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1851:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1852:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1853:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1854:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1855:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1856:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1857:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1858:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1859:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1860:       v    += 121;
1861:     }
1862:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1863:     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1864:     if (!usecprow) {
1865:       z += 11; y += 11;
1866:     }
1867:   }
1868:   VecRestoreArrayRead(xx,&x);
1869:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1870:   PetscLogFlops(242.0*a->nz);
1871:   return(0);
1872: }

1874: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1875: {
1876:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1877:   PetscScalar       *z = 0,*work,*workt,*zarray;
1878:   const PetscScalar *x,*xb;
1879:   const MatScalar   *v;
1880:   PetscErrorCode    ierr;
1881:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1882:   PetscInt          ncols,k;
1883:   const PetscInt    *ridx = NULL,*idx,*ii;
1884:   PetscBool         usecprow = a->compressedrow.use;

1887:   VecCopy(yy,zz);
1888:   VecGetArrayRead(xx,&x);
1889:   VecGetArray(zz,&zarray);

1891:   idx = a->j;
1892:   v   = a->a;
1893:   if (usecprow) {
1894:     mbs  = a->compressedrow.nrows;
1895:     ii   = a->compressedrow.i;
1896:     ridx = a->compressedrow.rindex;
1897:   } else {
1898:     mbs = a->mbs;
1899:     ii  = a->i;
1900:     z   = zarray;
1901:   }

1903:   if (!a->mult_work) {
1904:     k    = PetscMax(A->rmap->n,A->cmap->n);
1905:     PetscMalloc1(k+1,&a->mult_work);
1906:   }
1907:   work = a->mult_work;
1908:   for (i=0; i<mbs; i++) {
1909:     n     = ii[1] - ii[0]; ii++;
1910:     ncols = n*bs;
1911:     workt = work;
1912:     for (j=0; j<n; j++) {
1913:       xb = x + bs*(*idx++);
1914:       for (k=0; k<bs; k++) workt[k] = xb[k];
1915:       workt += bs;
1916:     }
1917:     if (usecprow) z = zarray + bs*ridx[i];
1918:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1919:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1920:     v += n*bs2;
1921:     if (!usecprow) z += bs;
1922:   }
1923:   VecRestoreArrayRead(xx,&x);
1924:   VecRestoreArray(zz,&zarray);
1925:   PetscLogFlops(2.0*a->nz*bs2);
1926:   return(0);
1927: }

1929: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1930: {
1931:   PetscScalar    zero = 0.0;

1935:   VecSet(zz,zero);
1936:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1937:   return(0);
1938: }

1940: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1941: {
1942:   PetscScalar    zero = 0.0;

1946:   VecSet(zz,zero);
1947:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1948:   return(0);
1949: }

1951: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1952: {
1953:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1954:   PetscScalar       *z,x1,x2,x3,x4,x5;
1955:   const PetscScalar *x,*xb = NULL;
1956:   const MatScalar   *v;
1957:   PetscErrorCode    ierr;
1958:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1959:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1960:   Mat_CompressedRow cprow = a->compressedrow;
1961:   PetscBool         usecprow = cprow.use;

1964:   if (yy != zz) { VecCopy(yy,zz); }
1965:   VecGetArrayRead(xx,&x);
1966:   VecGetArray(zz,&z);

1968:   idx = a->j;
1969:   v   = a->a;
1970:   if (usecprow) {
1971:     mbs  = cprow.nrows;
1972:     ii   = cprow.i;
1973:     ridx = cprow.rindex;
1974:   } else {
1975:     mbs=a->mbs;
1976:     ii = a->i;
1977:     xb = x;
1978:   }

1980:   switch (bs) {
1981:   case 1:
1982:     for (i=0; i<mbs; i++) {
1983:       if (usecprow) xb = x + ridx[i];
1984:       x1 = xb[0];
1985:       ib = idx + ii[0];
1986:       n  = ii[1] - ii[0]; ii++;
1987:       for (j=0; j<n; j++) {
1988:         rval     = ib[j];
1989:         z[rval] += PetscConj(*v) * x1;
1990:         v++;
1991:       }
1992:       if (!usecprow) xb++;
1993:     }
1994:     break;
1995:   case 2:
1996:     for (i=0; i<mbs; i++) {
1997:       if (usecprow) xb = x + 2*ridx[i];
1998:       x1 = xb[0]; x2 = xb[1];
1999:       ib = idx + ii[0];
2000:       n  = ii[1] - ii[0]; ii++;
2001:       for (j=0; j<n; j++) {
2002:         rval       = ib[j]*2;
2003:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2004:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2005:         v         += 4;
2006:       }
2007:       if (!usecprow) xb += 2;
2008:     }
2009:     break;
2010:   case 3:
2011:     for (i=0; i<mbs; i++) {
2012:       if (usecprow) xb = x + 3*ridx[i];
2013:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2014:       ib = idx + ii[0];
2015:       n  = ii[1] - ii[0]; ii++;
2016:       for (j=0; j<n; j++) {
2017:         rval       = ib[j]*3;
2018:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2019:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2020:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2021:         v         += 9;
2022:       }
2023:       if (!usecprow) xb += 3;
2024:     }
2025:     break;
2026:   case 4:
2027:     for (i=0; i<mbs; i++) {
2028:       if (usecprow) xb = x + 4*ridx[i];
2029:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2030:       ib = idx + ii[0];
2031:       n  = ii[1] - ii[0]; ii++;
2032:       for (j=0; j<n; j++) {
2033:         rval       = ib[j]*4;
2034:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2035:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2036:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2037:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2038:         v         += 16;
2039:       }
2040:       if (!usecprow) xb += 4;
2041:     }
2042:     break;
2043:   case 5:
2044:     for (i=0; i<mbs; i++) {
2045:       if (usecprow) xb = x + 5*ridx[i];
2046:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2047:       x4 = xb[3]; x5 = xb[4];
2048:       ib = idx + ii[0];
2049:       n  = ii[1] - ii[0]; ii++;
2050:       for (j=0; j<n; j++) {
2051:         rval       = ib[j]*5;
2052:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2053:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2054:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2055:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2056:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2057:         v         += 25;
2058:       }
2059:       if (!usecprow) xb += 5;
2060:     }
2061:     break;
2062:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2063:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2064: #if 0
2065:     {
2066:       PetscInt          ncols,k,bs2=a->bs2;
2067:       PetscScalar       *work,*workt,zb;
2068:       const PetscScalar *xtmp;
2069:       if (!a->mult_work) {
2070:         k    = PetscMax(A->rmap->n,A->cmap->n);
2071:         PetscMalloc1(k+1,&a->mult_work);
2072:       }
2073:       work = a->mult_work;
2074:       xtmp = x;
2075:       for (i=0; i<mbs; i++) {
2076:         n     = ii[1] - ii[0]; ii++;
2077:         ncols = n*bs;
2078:         PetscMemzero(work,ncols*sizeof(PetscScalar));
2079:         if (usecprow) xtmp = x + bs*ridx[i];
2080:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2081:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2082:         v += n*bs2;
2083:         if (!usecprow) xtmp += bs;
2084:         workt = work;
2085:         for (j=0; j<n; j++) {
2086:           zb = z + bs*(*idx++);
2087:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2088:           workt += bs;
2089:         }
2090:       }
2091:     }
2092: #endif
2093:   }
2094:   VecRestoreArrayRead(xx,&x);
2095:   VecRestoreArray(zz,&z);
2096:   PetscLogFlops(2.0*a->nz*a->bs2);
2097:   return(0);
2098: }

2100: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2101: {
2102:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2103:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2104:   const PetscScalar *x,*xb = 0;
2105:   const MatScalar   *v;
2106:   PetscErrorCode    ierr;
2107:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2108:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2109:   Mat_CompressedRow cprow   = a->compressedrow;
2110:   PetscBool         usecprow=cprow.use;

2113:   if (yy != zz) { VecCopy(yy,zz); }
2114:   VecGetArrayRead(xx,&x);
2115:   VecGetArray(zz,&z);

2117:   idx = a->j;
2118:   v   = a->a;
2119:   if (usecprow) {
2120:     mbs  = cprow.nrows;
2121:     ii   = cprow.i;
2122:     ridx = cprow.rindex;
2123:   } else {
2124:     mbs=a->mbs;
2125:     ii = a->i;
2126:     xb = x;
2127:   }

2129:   switch (bs) {
2130:   case 1:
2131:     for (i=0; i<mbs; i++) {
2132:       if (usecprow) xb = x + ridx[i];
2133:       x1 = xb[0];
2134:       ib = idx + ii[0];
2135:       n  = ii[1] - ii[0]; ii++;
2136:       for (j=0; j<n; j++) {
2137:         rval     = ib[j];
2138:         z[rval] += *v * x1;
2139:         v++;
2140:       }
2141:       if (!usecprow) xb++;
2142:     }
2143:     break;
2144:   case 2:
2145:     for (i=0; i<mbs; i++) {
2146:       if (usecprow) xb = x + 2*ridx[i];
2147:       x1 = xb[0]; x2 = xb[1];
2148:       ib = idx + ii[0];
2149:       n  = ii[1] - ii[0]; ii++;
2150:       for (j=0; j<n; j++) {
2151:         rval       = ib[j]*2;
2152:         z[rval++] += v[0]*x1 + v[1]*x2;
2153:         z[rval++] += v[2]*x1 + v[3]*x2;
2154:         v         += 4;
2155:       }
2156:       if (!usecprow) xb += 2;
2157:     }
2158:     break;
2159:   case 3:
2160:     for (i=0; i<mbs; i++) {
2161:       if (usecprow) xb = x + 3*ridx[i];
2162:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2163:       ib = idx + ii[0];
2164:       n  = ii[1] - ii[0]; ii++;
2165:       for (j=0; j<n; j++) {
2166:         rval       = ib[j]*3;
2167:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2168:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2169:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2170:         v         += 9;
2171:       }
2172:       if (!usecprow) xb += 3;
2173:     }
2174:     break;
2175:   case 4:
2176:     for (i=0; i<mbs; i++) {
2177:       if (usecprow) xb = x + 4*ridx[i];
2178:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2179:       ib = idx + ii[0];
2180:       n  = ii[1] - ii[0]; ii++;
2181:       for (j=0; j<n; j++) {
2182:         rval       = ib[j]*4;
2183:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2184:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2185:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2186:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2187:         v         += 16;
2188:       }
2189:       if (!usecprow) xb += 4;
2190:     }
2191:     break;
2192:   case 5:
2193:     for (i=0; i<mbs; i++) {
2194:       if (usecprow) xb = x + 5*ridx[i];
2195:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2196:       x4 = xb[3]; x5 = xb[4];
2197:       ib = idx + ii[0];
2198:       n  = ii[1] - ii[0]; ii++;
2199:       for (j=0; j<n; j++) {
2200:         rval       = ib[j]*5;
2201:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2202:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2203:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2204:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2205:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2206:         v         += 25;
2207:       }
2208:       if (!usecprow) xb += 5;
2209:     }
2210:     break;
2211:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2212:     PetscInt          ncols,k;
2213:     PetscScalar       *work,*workt;
2214:     const PetscScalar *xtmp;
2215:     if (!a->mult_work) {
2216:       k    = PetscMax(A->rmap->n,A->cmap->n);
2217:       PetscMalloc1(k+1,&a->mult_work);
2218:     }
2219:     work = a->mult_work;
2220:     xtmp = x;
2221:     for (i=0; i<mbs; i++) {
2222:       n     = ii[1] - ii[0]; ii++;
2223:       ncols = n*bs;
2224:       PetscMemzero(work,ncols*sizeof(PetscScalar));
2225:       if (usecprow) xtmp = x + bs*ridx[i];
2226:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2227:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2228:       v += n*bs2;
2229:       if (!usecprow) xtmp += bs;
2230:       workt = work;
2231:       for (j=0; j<n; j++) {
2232:         zb = z + bs*(*idx++);
2233:         for (k=0; k<bs; k++) zb[k] += workt[k];
2234:         workt += bs;
2235:       }
2236:     }
2237:     }
2238:   }
2239:   VecRestoreArrayRead(xx,&x);
2240:   VecRestoreArray(zz,&z);
2241:   PetscLogFlops(2.0*a->nz*a->bs2);
2242:   return(0);
2243: }

2245: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2246: {
2247:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2248:   PetscInt       totalnz = a->bs2*a->nz;
2249:   PetscScalar    oalpha  = alpha;
2251:   PetscBLASInt   one = 1,tnz;

2254:   PetscBLASIntCast(totalnz,&tnz);
2255:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2256:   PetscLogFlops(totalnz);
2257:   return(0);
2258: }

2260: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2261: {
2263:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2264:   MatScalar      *v  = a->a;
2265:   PetscReal      sum = 0.0;
2266:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2269:   if (type == NORM_FROBENIUS) {
2270: #if defined(PETSC_USE_REAL___FP16)
2271:     PetscBLASInt one = 1,cnt = bs2*nz;
2272:     *norm = BLASnrm2_(&cnt,v,&one);
2273: #else
2274:     for (i=0; i<bs2*nz; i++) {
2275:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2276:     }
2277: #endif
2278:     *norm = PetscSqrtReal(sum);
2279:     PetscLogFlops(2*bs2*nz);
2280:   } else if (type == NORM_1) { /* maximum column sum */
2281:     PetscReal *tmp;
2282:     PetscInt  *bcol = a->j;
2283:     PetscCalloc1(A->cmap->n+1,&tmp);
2284:     for (i=0; i<nz; i++) {
2285:       for (j=0; j<bs; j++) {
2286:         k1 = bs*(*bcol) + j; /* column index */
2287:         for (k=0; k<bs; k++) {
2288:           tmp[k1] += PetscAbsScalar(*v); v++;
2289:         }
2290:       }
2291:       bcol++;
2292:     }
2293:     *norm = 0.0;
2294:     for (j=0; j<A->cmap->n; j++) {
2295:       if (tmp[j] > *norm) *norm = tmp[j];
2296:     }
2297:     PetscFree(tmp);
2298:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2299:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2300:     *norm = 0.0;
2301:     for (k=0; k<bs; k++) {
2302:       for (j=0; j<a->mbs; j++) {
2303:         v   = a->a + bs2*a->i[j] + k;
2304:         sum = 0.0;
2305:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2306:           for (k1=0; k1<bs; k1++) {
2307:             sum += PetscAbsScalar(*v);
2308:             v   += bs;
2309:           }
2310:         }
2311:         if (sum > *norm) *norm = sum;
2312:       }
2313:     }
2314:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2315:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2316:   return(0);
2317: }


2320: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2321: {
2322:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2326:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2327:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2328:     *flg = PETSC_FALSE;
2329:     return(0);
2330:   }

2332:   /* if the a->i are the same */
2333:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2334:   if (!*flg) return(0);

2336:   /* if a->j are the same */
2337:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2338:   if (!*flg) return(0);

2340:   /* if a->a are the same */
2341:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2342:   return(0);

2344: }

2346: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2347: {
2348:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2350:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2351:   PetscScalar    *x,zero = 0.0;
2352:   MatScalar      *aa,*aa_j;

2355:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2356:   bs   = A->rmap->bs;
2357:   aa   = a->a;
2358:   ai   = a->i;
2359:   aj   = a->j;
2360:   ambs = a->mbs;
2361:   bs2  = a->bs2;

2363:   VecSet(v,zero);
2364:   VecGetArray(v,&x);
2365:   VecGetLocalSize(v,&n);
2366:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2367:   for (i=0; i<ambs; i++) {
2368:     for (j=ai[i]; j<ai[i+1]; j++) {
2369:       if (aj[j] == i) {
2370:         row  = i*bs;
2371:         aa_j = aa+j*bs2;
2372:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2373:         break;
2374:       }
2375:     }
2376:   }
2377:   VecRestoreArray(v,&x);
2378:   return(0);
2379: }

2381: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2382: {
2383:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2384:   const PetscScalar *l,*r,*li,*ri;
2385:   PetscScalar       x;
2386:   MatScalar         *aa, *v;
2387:   PetscErrorCode    ierr;
2388:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2389:   const PetscInt    *ai,*aj;

2392:   ai  = a->i;
2393:   aj  = a->j;
2394:   aa  = a->a;
2395:   m   = A->rmap->n;
2396:   n   = A->cmap->n;
2397:   bs  = A->rmap->bs;
2398:   mbs = a->mbs;
2399:   bs2 = a->bs2;
2400:   if (ll) {
2401:     VecGetArrayRead(ll,&l);
2402:     VecGetLocalSize(ll,&lm);
2403:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2404:     for (i=0; i<mbs; i++) { /* for each block row */
2405:       M  = ai[i+1] - ai[i];
2406:       li = l + i*bs;
2407:       v  = aa + bs2*ai[i];
2408:       for (j=0; j<M; j++) { /* for each block */
2409:         for (k=0; k<bs2; k++) {
2410:           (*v++) *= li[k%bs];
2411:         }
2412:       }
2413:     }
2414:     VecRestoreArrayRead(ll,&l);
2415:     PetscLogFlops(a->nz);
2416:   }

2418:   if (rr) {
2419:     VecGetArrayRead(rr,&r);
2420:     VecGetLocalSize(rr,&rn);
2421:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2422:     for (i=0; i<mbs; i++) { /* for each block row */
2423:       iai = ai[i];
2424:       M   = ai[i+1] - iai;
2425:       v   = aa + bs2*iai;
2426:       for (j=0; j<M; j++) { /* for each block */
2427:         ri = r + bs*aj[iai+j];
2428:         for (k=0; k<bs; k++) {
2429:           x = ri[k];
2430:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2431:           v += bs;
2432:         }
2433:       }
2434:     }
2435:     VecRestoreArrayRead(rr,&r);
2436:     PetscLogFlops(a->nz);
2437:   }
2438:   return(0);
2439: }


2442: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2443: {
2444:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2447:   info->block_size   = a->bs2;
2448:   info->nz_allocated = a->bs2*a->maxnz;
2449:   info->nz_used      = a->bs2*a->nz;
2450:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2451:   info->assemblies   = A->num_ass;
2452:   info->mallocs      = A->info.mallocs;
2453:   info->memory       = ((PetscObject)A)->mem;
2454:   if (A->factortype) {
2455:     info->fill_ratio_given  = A->info.fill_ratio_given;
2456:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2457:     info->factor_mallocs    = A->info.factor_mallocs;
2458:   } else {
2459:     info->fill_ratio_given  = 0;
2460:     info->fill_ratio_needed = 0;
2461:     info->factor_mallocs    = 0;
2462:   }
2463:   return(0);
2464: }

2466: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2467: {
2468:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2472:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2473:   return(0);
2474: }