Actual source code: sbaij2.c

petsc-3.13.6 2020-09-29
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <../src/mat/impls/dense/seq/dense.h>
  4:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5:  #include <petsc/private/kernels/blockinvert.h>
  6:  #include <petscbt.h>
  7:  #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 13:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 14:   const PetscInt *idx;
 15:   PetscBT        table_out,table_in;

 18:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 19:   mbs  = a->mbs;
 20:   ai   = a->i;
 21:   aj   = a->j;
 22:   bs   = A->rmap->bs;
 23:   PetscBTCreate(mbs,&table_out);
 24:   PetscMalloc1(mbs+1,&nidx);
 25:   PetscMalloc1(A->rmap->N+1,&nidx2);
 26:   PetscBTCreate(mbs,&table_in);

 28:   for (i=0; i<is_max; i++) { /* for each is */
 29:     isz  = 0;
 30:     PetscBTMemzero(mbs,table_out);

 32:     /* Extract the indices, assume there can be duplicate entries */
 33:     ISGetIndices(is[i],&idx);
 34:     ISGetLocalSize(is[i],&n);

 36:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 37:     bcol_max = 0;
 38:     for (j=0; j<n; ++j) {
 39:       brow = idx[j]/bs; /* convert the indices into block indices */
 40:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 41:       if (!PetscBTLookupSet(table_out,brow)) {
 42:         nidx[isz++] = brow;
 43:         if (bcol_max < brow) bcol_max = brow;
 44:       }
 45:     }
 46:     ISRestoreIndices(is[i],&idx);
 47:     ISDestroy(&is[i]);

 49:     k = 0;
 50:     for (j=0; j<ov; j++) { /* for each overlap */
 51:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 52:       PetscBTMemzero(mbs,table_in);
 53:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 55:       n = isz;  /* length of the updated is[i] */
 56:       for (brow=0; brow<mbs; brow++) {
 57:         start = ai[brow]; end   = ai[brow+1];
 58:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 59:           for (l = start; l<end; l++) {
 60:             bcol = aj[l];
 61:             if (!PetscBTLookupSet(table_out,bcol)) {
 62:               nidx[isz++] = bcol;
 63:               if (bcol_max < bcol) bcol_max = bcol;
 64:             }
 65:           }
 66:           k++;
 67:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 68:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 69:           for (l = start; l<end; l++) {
 70:             bcol = aj[l];
 71:             if (bcol > bcol_max) break;
 72:             if (PetscBTLookup(table_in,bcol)) {
 73:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 74:               break; /* for l = start; l<end ; l++) */
 75:             }
 76:           }
 77:         }
 78:       }
 79:     } /* for each overlap */

 81:     /* expand the Index Set */
 82:     for (j=0; j<isz; j++) {
 83:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 84:     }
 85:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 86:   }
 87:   PetscBTDestroy(&table_out);
 88:   PetscFree(nidx);
 89:   PetscFree(nidx2);
 90:   PetscBTDestroy(&table_in);
 91:   return(0);
 92: }

 94: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 95:         Zero some ops' to avoid invalid usse */
 96: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 97: {

101:   MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
102:   Bseq->ops->mult                   = 0;
103:   Bseq->ops->multadd                = 0;
104:   Bseq->ops->multtranspose          = 0;
105:   Bseq->ops->multtransposeadd       = 0;
106:   Bseq->ops->lufactor               = 0;
107:   Bseq->ops->choleskyfactor         = 0;
108:   Bseq->ops->lufactorsymbolic       = 0;
109:   Bseq->ops->choleskyfactorsymbolic = 0;
110:   Bseq->ops->getinertia             = 0;
111:   return(0);
112: }

114: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
115: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116: {
117:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
119:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121:   const PetscInt *irow,*icol;
122:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123:   PetscInt       *aj = a->j,*ai = a->i;
124:   MatScalar      *mat_a;
125:   Mat            C;
126:   PetscBool      flag;


130:   ISGetIndices(isrow,&irow);
131:   ISGetIndices(iscol,&icol);
132:   ISGetLocalSize(isrow,&nrows);
133:   ISGetLocalSize(iscol,&ncols);

135:   PetscCalloc1(1+oldcols,&smap);
136:   ssmap = smap;
137:   PetscMalloc1(1+nrows,&lens);
138:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139:   /* determine lens of each row */
140:   for (i=0; i<nrows; i++) {
141:     kstart  = ai[irow[i]];
142:     kend    = kstart + a->ilen[irow[i]];
143:     lens[i] = 0;
144:     for (k=kstart; k<kend; k++) {
145:       if (ssmap[aj[k]]) lens[i]++;
146:     }
147:   }
148:   /* Create and fill new matrix */
149:   if (scall == MAT_REUSE_MATRIX) {
150:     c = (Mat_SeqSBAIJ*)((*B)->data);

152:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
154:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155:     PetscArrayzero(c->ilen,c->mbs);
156:     C    = *B;
157:   } else {
158:     MatCreate(PetscObjectComm((PetscObject)A),&C);
159:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
160:     MatSetType(C,((PetscObject)A)->type_name);
161:     MatSeqSBAIJSetPreallocation(C,bs,0,lens);
162:   }
163:   c = (Mat_SeqSBAIJ*)(C->data);
164:   for (i=0; i<nrows; i++) {
165:     row      = irow[i];
166:     kstart   = ai[row];
167:     kend     = kstart + a->ilen[row];
168:     mat_i    = c->i[i];
169:     mat_j    = c->j + mat_i;
170:     mat_a    = c->a + mat_i*bs2;
171:     mat_ilen = c->ilen + i;
172:     for (k=kstart; k<kend; k++) {
173:       if ((tcol=ssmap[a->j[k]])) {
174:         *mat_j++ = tcol - 1;
175:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
176:         mat_a   += bs2;
177:         (*mat_ilen)++;
178:       }
179:     }
180:   }
181:   /* sort */
182:   {
183:     MatScalar *work;

185:     PetscMalloc1(bs2,&work);
186:     for (i=0; i<nrows; i++) {
187:       PetscInt ilen;
188:       mat_i = c->i[i];
189:       mat_j = c->j + mat_i;
190:       mat_a = c->a + mat_i*bs2;
191:       ilen  = c->ilen[i];
192:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
193:     }
194:     PetscFree(work);
195:   }

197:   /* Free work space */
198:   ISRestoreIndices(iscol,&icol);
199:   PetscFree(smap);
200:   PetscFree(lens);
201:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
202:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

204:   ISRestoreIndices(isrow,&irow);
205:   *B   = C;
206:   return(0);
207: }

209: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210: {
211:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212:   IS             is1,is2;
214:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215:   const PetscInt *irow,*icol;

218:   ISGetIndices(isrow,&irow);
219:   ISGetIndices(iscol,&icol);
220:   ISGetLocalSize(isrow,&nrows);
221:   ISGetLocalSize(iscol,&ncols);

223:   /* Verify if the indices corespond to each element in a block
224:    and form the IS with compressed IS */
225:   maxmnbs = PetscMax(a->mbs,a->nbs);
226:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
227:   PetscArrayzero(vary,a->mbs);
228:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229:   for (i=0; i<a->mbs; i++) {
230:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231:   }
232:   count = 0;
233:   for (i=0; i<nrows; i++) {
234:     PetscInt j = irow[i] / bs;
235:     if ((vary[j]--)==bs) iary[count++] = j;
236:   }
237:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

239:   PetscArrayzero(vary,a->nbs);
240:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241:   for (i=0; i<a->nbs; i++) {
242:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243:   }
244:   count = 0;
245:   for (i=0; i<ncols; i++) {
246:     PetscInt j = icol[i] / bs;
247:     if ((vary[j]--)==bs) iary[count++] = j;
248:   }
249:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
250:   ISRestoreIndices(isrow,&irow);
251:   ISRestoreIndices(iscol,&icol);
252:   PetscFree2(vary,iary);

254:   MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
255:   ISDestroy(&is1);
256:   ISDestroy(&is2);

258:   if (isrow != iscol) {
259:     PetscBool isequal;
260:     ISEqual(isrow,iscol,&isequal);
261:     if (!isequal) {
262:       MatSeqSBAIJZeroOps_Private(*B);
263:     }
264:   }
265:   return(0);
266: }

268: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269: {
271:   PetscInt       i;

274:   if (scall == MAT_INITIAL_MATRIX) {
275:     PetscCalloc1(n+1,B);
276:   }

278:   for (i=0; i<n; i++) {
279:     MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
280:   }
281:   return(0);
282: }

284: /* -------------------------------------------------------*/
285: /* Should check that shapes of vectors and matrices match */
286: /* -------------------------------------------------------*/

288: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289: {
290:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291:   PetscScalar       *z,x1,x2,zero=0.0;
292:   const PetscScalar *x,*xb;
293:   const MatScalar   *v;
294:   PetscErrorCode    ierr;
295:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
297:   PetscInt          nonzerorow=0;

300:   VecSet(zz,zero);
301:   if (!a->nz) return(0);
302:   VecGetArrayRead(xx,&x);
303:   VecGetArray(zz,&z);

305:   v  = a->a;
306:   xb = x;

308:   for (i=0; i<mbs; i++) {
309:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
310:     x1          = xb[0]; x2 = xb[1];
311:     ib          = aj + *ai;
312:     jmin        = 0;
313:     nonzerorow += (n>0);
314:     if (*ib == i) {     /* (diag of A)*x */
315:       z[2*i]   += v[0]*x1 + v[2]*x2;
316:       z[2*i+1] += v[2]*x1 + v[3]*x2;
317:       v        += 4; jmin++;
318:     }
319:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321:     for (j=jmin; j<n; j++) {
322:       /* (strict lower triangular part of A)*x  */
323:       cval       = ib[j]*2;
324:       z[cval]   += v[0]*x1 + v[1]*x2;
325:       z[cval+1] += v[2]*x1 + v[3]*x2;
326:       /* (strict upper triangular part of A)*x  */
327:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
328:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329:       v        += 4;
330:     }
331:     xb +=2; ai++;
332:   }

334:   VecRestoreArrayRead(xx,&x);
335:   VecRestoreArray(zz,&z);
336:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
337:   return(0);
338: }

340: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341: {
342:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343:   PetscScalar       *z,x1,x2,x3,zero=0.0;
344:   const PetscScalar *x,*xb;
345:   const MatScalar   *v;
346:   PetscErrorCode    ierr;
347:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
349:   PetscInt          nonzerorow=0;

352:   VecSet(zz,zero);
353:   if (!a->nz) return(0);
354:   VecGetArrayRead(xx,&x);
355:   VecGetArray(zz,&z);

357:   v  = a->a;
358:   xb = x;

360:   for (i=0; i<mbs; i++) {
361:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
362:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
363:     ib          = aj + *ai;
364:     jmin        = 0;
365:     nonzerorow += (n>0);
366:     if (*ib == i) {     /* (diag of A)*x */
367:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
368:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370:       v        += 9; jmin++;
371:     }
372:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374:     for (j=jmin; j<n; j++) {
375:       /* (strict lower triangular part of A)*x  */
376:       cval       = ib[j]*3;
377:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
378:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380:       /* (strict upper triangular part of A)*x  */
381:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384:       v        += 9;
385:     }
386:     xb +=3; ai++;
387:   }

389:   VecRestoreArrayRead(xx,&x);
390:   VecRestoreArray(zz,&z);
391:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
392:   return(0);
393: }

395: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396: {
397:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398:   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399:   const PetscScalar *x,*xb;
400:   const MatScalar   *v;
401:   PetscErrorCode    ierr;
402:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
404:   PetscInt          nonzerorow = 0;

407:   VecSet(zz,zero);
408:   if (!a->nz) return(0);
409:   VecGetArrayRead(xx,&x);
410:   VecGetArray(zz,&z);

412:   v  = a->a;
413:   xb = x;

415:   for (i=0; i<mbs; i++) {
416:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418:     ib          = aj + *ai;
419:     jmin        = 0;
420:     nonzerorow += (n>0);
421:     if (*ib == i) {     /* (diag of A)*x */
422:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
423:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
424:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426:       v        += 16; jmin++;
427:     }
428:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430:     for (j=jmin; j<n; j++) {
431:       /* (strict lower triangular part of A)*x  */
432:       cval       = ib[j]*4;
433:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437:       /* (strict upper triangular part of A)*x  */
438:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442:       v        += 16;
443:     }
444:     xb +=4; ai++;
445:   }

447:   VecRestoreArrayRead(xx,&x);
448:   VecRestoreArray(zz,&z);
449:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
450:   return(0);
451: }

453: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454: {
455:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456:   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457:   const PetscScalar *x,*xb;
458:   const MatScalar   *v;
459:   PetscErrorCode    ierr;
460:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
462:   PetscInt          nonzerorow=0;

465:   VecSet(zz,zero);
466:   if (!a->nz) return(0);
467:   VecGetArrayRead(xx,&x);
468:   VecGetArray(zz,&z);

470:   v  = a->a;
471:   xb = x;

473:   for (i=0; i<mbs; i++) {
474:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
475:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476:     ib          = aj + *ai;
477:     jmin        = 0;
478:     nonzerorow += (n>0);
479:     if (*ib == i) {      /* (diag of A)*x */
480:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485:       v        += 25; jmin++;
486:     }
487:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489:     for (j=jmin; j<n; j++) {
490:       /* (strict lower triangular part of A)*x  */
491:       cval       = ib[j]*5;
492:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497:       /* (strict upper triangular part of A)*x  */
498:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503:       v        += 25;
504:     }
505:     xb +=5; ai++;
506:   }

508:   VecRestoreArrayRead(xx,&x);
509:   VecRestoreArray(zz,&z);
510:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
511:   return(0);
512: }

514: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518:   const PetscScalar *x,*xb;
519:   const MatScalar   *v;
520:   PetscErrorCode    ierr;
521:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523:   PetscInt          nonzerorow=0;

526:   VecSet(zz,zero);
527:   if (!a->nz) return(0);
528:   VecGetArrayRead(xx,&x);
529:   VecGetArray(zz,&z);

531:   v  = a->a;
532:   xb = x;

534:   for (i=0; i<mbs; i++) {
535:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537:     ib          = aj + *ai;
538:     jmin        = 0;
539:     nonzerorow += (n>0);
540:     if (*ib == i) {      /* (diag of A)*x */
541:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547:       v        += 36; jmin++;
548:     }
549:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551:     for (j=jmin; j<n; j++) {
552:       /* (strict lower triangular part of A)*x  */
553:       cval       = ib[j]*6;
554:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560:       /* (strict upper triangular part of A)*x  */
561:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567:       v        += 36;
568:     }
569:     xb +=6; ai++;
570:   }

572:   VecRestoreArrayRead(xx,&x);
573:   VecRestoreArray(zz,&z);
574:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
575:   return(0);
576: }

578: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579: {
580:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582:   const PetscScalar *x,*xb;
583:   const MatScalar   *v;
584:   PetscErrorCode    ierr;
585:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
587:   PetscInt          nonzerorow=0;

590:   VecSet(zz,zero);
591:   if (!a->nz) return(0);
592:   VecGetArrayRead(xx,&x);
593:   VecGetArray(zz,&z);

595:   v  = a->a;
596:   xb = x;

598:   for (i=0; i<mbs; i++) {
599:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
600:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601:     ib          = aj + *ai;
602:     jmin        = 0;
603:     nonzerorow += (n>0);
604:     if (*ib == i) {      /* (diag of A)*x */
605:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612:       v        += 49; jmin++;
613:     }
614:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616:     for (j=jmin; j<n; j++) {
617:       /* (strict lower triangular part of A)*x  */
618:       cval       = ib[j]*7;
619:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626:       /* (strict upper triangular part of A)*x  */
627:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634:       v       += 49;
635:     }
636:     xb +=7; ai++;
637:   }
638:   VecRestoreArrayRead(xx,&x);
639:   VecRestoreArray(zz,&z);
640:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
641:   return(0);
642: }

644: /*
645:     This will not work with MatScalar == float because it calls the BLAS
646: */
647: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648: {
649:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650:   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651:   const PetscScalar *x,*x_ptr,*xb;
652:   const MatScalar   *v;
653:   PetscErrorCode    ierr;
654:   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655:   const PetscInt    *idx,*aj,*ii;
656:   PetscInt          nonzerorow=0;

659:   VecSet(zz,zero);
660:   if (!a->nz) return(0);
661:   VecGetArrayRead(xx,&x);x_ptr = x;
662:   VecGetArray(zz,&z); z_ptr=z;

664:   aj = a->j;
665:   v  = a->a;
666:   ii = a->i;

668:   if (!a->mult_work) {
669:     PetscMalloc1(A->rmap->N+1,&a->mult_work);
670:   }
671:   work = a->mult_work;

673:   for (i=0; i<mbs; i++) {
674:     n           = ii[1] - ii[0]; ncols = n*bs;
675:     workt       = work; idx=aj+ii[0];
676:     nonzerorow += (n>0);

678:     /* upper triangular part */
679:     for (j=0; j<n; j++) {
680:       xb = x_ptr + bs*(*idx++);
681:       for (k=0; k<bs; k++) workt[k] = xb[k];
682:       workt += bs;
683:     }
684:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
685:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

687:     /* strict lower triangular part */
688:     idx = aj+ii[0];
689:     if (*idx == i) {
690:       ncols -= bs; v += bs2; idx++; n--;
691:     }

693:     if (ncols > 0) {
694:       workt = work;
695:       PetscArrayzero(workt,ncols);
696:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
697:       for (j=0; j<n; j++) {
698:         zb = z_ptr + bs*(*idx++);
699:         for (k=0; k<bs; k++) zb[k] += workt[k];
700:         workt += bs;
701:       }
702:     }
703:     x += bs; v += n*bs2; z += bs; ii++;
704:   }

706:   VecRestoreArrayRead(xx,&x);
707:   VecRestoreArray(zz,&z);
708:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
709:   return(0);
710: }

712: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
713: {
714:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
715:   PetscScalar       *z,x1;
716:   const PetscScalar *x,*xb;
717:   const MatScalar   *v;
718:   PetscErrorCode    ierr;
719:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
720:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
721:   PetscInt          nonzerorow=0;
722: #if defined(PETSC_USE_COMPLEX)
723:   const int         aconj = A->hermitian;
724: #else
725:   const int         aconj = 0;
726: #endif

729:   VecCopy(yy,zz);
730:   if (!a->nz) return(0);
731:   VecGetArrayRead(xx,&x);
732:   VecGetArray(zz,&z);
733:   v    = a->a;
734:   xb   = x;

736:   for (i=0; i<mbs; i++) {
737:     n           = ai[1] - ai[0]; /* length of i_th row of A */
738:     x1          = xb[0];
739:     ib          = aj + *ai;
740:     jmin        = 0;
741:     nonzerorow += (n>0);
742:     if (n && *ib == i) {            /* (diag of A)*x */
743:       z[i] += *v++ * x[*ib++]; jmin++;
744:     }
745:     if (aconj) {
746:       for (j=jmin; j<n; j++) {
747:         cval    = *ib;
748:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
749:         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
750:       }
751:     } else {
752:       for (j=jmin; j<n; j++) {
753:         cval    = *ib;
754:         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
755:         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
756:       }
757:     }
758:     xb++; ai++;
759:   }

761:   VecRestoreArrayRead(xx,&x);
762:   VecRestoreArray(zz,&z);

764:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
765:   return(0);
766: }

768: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
769: {
770:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
771:   PetscScalar       *z,x1,x2;
772:   const PetscScalar *x,*xb;
773:   const MatScalar   *v;
774:   PetscErrorCode    ierr;
775:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
776:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
777:   PetscInt          nonzerorow=0;

780:   VecCopy(yy,zz);
781:   if (!a->nz) return(0);
782:   VecGetArrayRead(xx,&x);
783:   VecGetArray(zz,&z);

785:   v  = a->a;
786:   xb = x;

788:   for (i=0; i<mbs; i++) {
789:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
790:     x1          = xb[0]; x2 = xb[1];
791:     ib          = aj + *ai;
792:     jmin        = 0;
793:     nonzerorow += (n>0);
794:     if (*ib == i) {      /* (diag of A)*x */
795:       z[2*i]   += v[0]*x1 + v[2]*x2;
796:       z[2*i+1] += v[2]*x1 + v[3]*x2;
797:       v        += 4; jmin++;
798:     }
799:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
801:     for (j=jmin; j<n; j++) {
802:       /* (strict lower triangular part of A)*x  */
803:       cval       = ib[j]*2;
804:       z[cval]   += v[0]*x1 + v[1]*x2;
805:       z[cval+1] += v[2]*x1 + v[3]*x2;
806:       /* (strict upper triangular part of A)*x  */
807:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
808:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
809:       v        += 4;
810:     }
811:     xb +=2; ai++;
812:   }
813:   VecRestoreArrayRead(xx,&x);
814:   VecRestoreArray(zz,&z);

816:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
817:   return(0);
818: }

820: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
821: {
822:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
823:   PetscScalar       *z,x1,x2,x3;
824:   const PetscScalar *x,*xb;
825:   const MatScalar   *v;
826:   PetscErrorCode    ierr;
827:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
828:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
829:   PetscInt          nonzerorow=0;

832:   VecCopy(yy,zz);
833:   if (!a->nz) return(0);
834:   VecGetArrayRead(xx,&x);
835:   VecGetArray(zz,&z);

837:   v  = a->a;
838:   xb = x;

840:   for (i=0; i<mbs; i++) {
841:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
842:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
843:     ib          = aj + *ai;
844:     jmin        = 0;
845:     nonzerorow += (n>0);
846:     if (*ib == i) {     /* (diag of A)*x */
847:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
848:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
849:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
850:       v        += 9; jmin++;
851:     }
852:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
853:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
854:     for (j=jmin; j<n; j++) {
855:       /* (strict lower triangular part of A)*x  */
856:       cval       = ib[j]*3;
857:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
858:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
859:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
860:       /* (strict upper triangular part of A)*x  */
861:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
862:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
863:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
864:       v        += 9;
865:     }
866:     xb +=3; ai++;
867:   }

869:   VecRestoreArrayRead(xx,&x);
870:   VecRestoreArray(zz,&z);

872:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
873:   return(0);
874: }

876: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
877: {
878:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
879:   PetscScalar       *z,x1,x2,x3,x4;
880:   const PetscScalar *x,*xb;
881:   const MatScalar   *v;
882:   PetscErrorCode    ierr;
883:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
884:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
885:   PetscInt          nonzerorow=0;

888:   VecCopy(yy,zz);
889:   if (!a->nz) return(0);
890:   VecGetArrayRead(xx,&x);
891:   VecGetArray(zz,&z);

893:   v  = a->a;
894:   xb = x;

896:   for (i=0; i<mbs; i++) {
897:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
898:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
899:     ib          = aj + *ai;
900:     jmin        = 0;
901:     nonzerorow += (n>0);
902:     if (*ib == i) {      /* (diag of A)*x */
903:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
904:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
905:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
906:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
907:       v        += 16; jmin++;
908:     }
909:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
910:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
911:     for (j=jmin; j<n; j++) {
912:       /* (strict lower triangular part of A)*x  */
913:       cval       = ib[j]*4;
914:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
915:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
916:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
917:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
918:       /* (strict upper triangular part of A)*x  */
919:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
920:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
921:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
922:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
923:       v        += 16;
924:     }
925:     xb +=4; ai++;
926:   }

928:   VecRestoreArrayRead(xx,&x);
929:   VecRestoreArray(zz,&z);

931:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
932:   return(0);
933: }

935: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
936: {
937:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
938:   PetscScalar       *z,x1,x2,x3,x4,x5;
939:   const PetscScalar *x,*xb;
940:   const MatScalar   *v;
941:   PetscErrorCode    ierr;
942:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
943:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
944:   PetscInt          nonzerorow=0;

947:   VecCopy(yy,zz);
948:   if (!a->nz) return(0);
949:   VecGetArrayRead(xx,&x);
950:   VecGetArray(zz,&z);

952:   v  = a->a;
953:   xb = x;

955:   for (i=0; i<mbs; i++) {
956:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
957:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
958:     ib          = aj + *ai;
959:     jmin        = 0;
960:     nonzerorow += (n>0);
961:     if (*ib == i) {      /* (diag of A)*x */
962:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
963:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
964:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
965:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
966:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
967:       v        += 25; jmin++;
968:     }
969:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
970:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971:     for (j=jmin; j<n; j++) {
972:       /* (strict lower triangular part of A)*x  */
973:       cval       = ib[j]*5;
974:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
975:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
976:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
977:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
978:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
979:       /* (strict upper triangular part of A)*x  */
980:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
981:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
982:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
983:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
984:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
985:       v        += 25;
986:     }
987:     xb +=5; ai++;
988:   }

990:   VecRestoreArrayRead(xx,&x);
991:   VecRestoreArray(zz,&z);

993:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
994:   return(0);
995: }

997: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
998: {
999:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1000:   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1001:   const PetscScalar *x,*xb;
1002:   const MatScalar   *v;
1003:   PetscErrorCode    ierr;
1004:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1005:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1006:   PetscInt          nonzerorow=0;

1009:   VecCopy(yy,zz);
1010:   if (!a->nz) return(0);
1011:   VecGetArrayRead(xx,&x);
1012:   VecGetArray(zz,&z);

1014:   v  = a->a;
1015:   xb = x;

1017:   for (i=0; i<mbs; i++) {
1018:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1019:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1020:     ib          = aj + *ai;
1021:     jmin        = 0;
1022:     nonzerorow += (n>0);
1023:     if (*ib == i) {     /* (diag of A)*x */
1024:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1025:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1026:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1027:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1028:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1029:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1030:       v        += 36; jmin++;
1031:     }
1032:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1033:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034:     for (j=jmin; j<n; j++) {
1035:       /* (strict lower triangular part of A)*x  */
1036:       cval       = ib[j]*6;
1037:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1038:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1039:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1040:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1041:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1042:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1043:       /* (strict upper triangular part of A)*x  */
1044:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1045:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1046:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1047:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1048:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1049:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1050:       v        += 36;
1051:     }
1052:     xb +=6; ai++;
1053:   }

1055:   VecRestoreArrayRead(xx,&x);
1056:   VecRestoreArray(zz,&z);

1058:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1059:   return(0);
1060: }

1062: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1063: {
1064:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1065:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1066:   const PetscScalar *x,*xb;
1067:   const MatScalar   *v;
1068:   PetscErrorCode    ierr;
1069:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1070:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1071:   PetscInt          nonzerorow=0;

1074:   VecCopy(yy,zz);
1075:   if (!a->nz) return(0);
1076:   VecGetArrayRead(xx,&x);
1077:   VecGetArray(zz,&z);

1079:   v  = a->a;
1080:   xb = x;

1082:   for (i=0; i<mbs; i++) {
1083:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1084:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1085:     ib          = aj + *ai;
1086:     jmin        = 0;
1087:     nonzerorow += (n>0);
1088:     if (*ib == i) {     /* (diag of A)*x */
1089:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1090:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1091:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1092:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1093:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1094:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1095:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1096:       v        += 49; jmin++;
1097:     }
1098:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1099:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1100:     for (j=jmin; j<n; j++) {
1101:       /* (strict lower triangular part of A)*x  */
1102:       cval       = ib[j]*7;
1103:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1104:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1105:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1106:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1107:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1108:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1109:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1110:       /* (strict upper triangular part of A)*x  */
1111:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1112:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1113:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1114:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1115:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1116:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1117:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1118:       v       += 49;
1119:     }
1120:     xb +=7; ai++;
1121:   }

1123:   VecRestoreArrayRead(xx,&x);
1124:   VecRestoreArray(zz,&z);

1126:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1127:   return(0);
1128: }

1130: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1131: {
1132:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1133:   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1134:   const PetscScalar *x,*x_ptr,*xb;
1135:   const MatScalar   *v;
1136:   PetscErrorCode    ierr;
1137:   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1138:   const PetscInt    *idx,*aj,*ii;
1139:   PetscInt          nonzerorow=0;

1142:   VecCopy(yy,zz);
1143:   if (!a->nz) return(0);
1144:   VecGetArrayRead(xx,&x); x_ptr=x;
1145:   VecGetArray(zz,&z); z_ptr=z;

1147:   aj = a->j;
1148:   v  = a->a;
1149:   ii = a->i;

1151:   if (!a->mult_work) {
1152:     PetscMalloc1(A->rmap->n+1,&a->mult_work);
1153:   }
1154:   work = a->mult_work;


1157:   for (i=0; i<mbs; i++) {
1158:     n           = ii[1] - ii[0]; ncols = n*bs;
1159:     workt       = work; idx=aj+ii[0];
1160:     nonzerorow += (n>0);

1162:     /* upper triangular part */
1163:     for (j=0; j<n; j++) {
1164:       xb = x_ptr + bs*(*idx++);
1165:       for (k=0; k<bs; k++) workt[k] = xb[k];
1166:       workt += bs;
1167:     }
1168:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1169:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1171:     /* strict lower triangular part */
1172:     idx = aj+ii[0];
1173:     if (*idx == i) {
1174:       ncols -= bs; v += bs2; idx++; n--;
1175:     }
1176:     if (ncols > 0) {
1177:       workt = work;
1178:       PetscArrayzero(workt,ncols);
1179:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1180:       for (j=0; j<n; j++) {
1181:         zb = z_ptr + bs*(*idx++);
1182:         for (k=0; k<bs; k++) zb[k] += workt[k];
1183:         workt += bs;
1184:       }
1185:     }

1187:     x += bs; v += n*bs2; z += bs; ii++;
1188:   }

1190:   VecRestoreArrayRead(xx,&x);
1191:   VecRestoreArray(zz,&z);

1193:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1194:   return(0);
1195: }

1197: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1198: {
1199:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1200:   PetscScalar    oalpha = alpha;
1202:   PetscBLASInt   one = 1,totalnz;

1205:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1206:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1207:   PetscLogFlops(totalnz);
1208:   return(0);
1209: }

1211: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1212: {
1213:   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1214:   const MatScalar *v       = a->a;
1215:   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1216:   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1217:   PetscErrorCode  ierr;
1218:   const PetscInt  *aj=a->j,*col;

1221:   if (!a->nz) {
1222:     *norm = 0.0;
1223:     return(0);
1224:   }
1225:   if (type == NORM_FROBENIUS) {
1226:     for (k=0; k<mbs; k++) {
1227:       jmin = a->i[k]; jmax = a->i[k+1];
1228:       col  = aj + jmin;
1229:       if (*col == k) {         /* diagonal block */
1230:         for (i=0; i<bs2; i++) {
1231:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1232:         }
1233:         jmin++;
1234:       }
1235:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1236:         for (i=0; i<bs2; i++) {
1237:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1238:         }
1239:       }
1240:     }
1241:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1242:     PetscLogFlops(2.0*bs2*a->nz);
1243:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1244:     PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1245:     for (i=0; i<mbs; i++) jl[i] = mbs;
1246:     il[0] = 0;

1248:     *norm = 0.0;
1249:     for (k=0; k<mbs; k++) { /* k_th block row */
1250:       for (j=0; j<bs; j++) sum[j]=0.0;
1251:       /*-- col sum --*/
1252:       i = jl[k]; /* first |A(i,k)| to be added */
1253:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1254:                   at step k */
1255:       while (i<mbs) {
1256:         nexti = jl[i];  /* next block row to be added */
1257:         ik    = il[i];  /* block index of A(i,k) in the array a */
1258:         for (j=0; j<bs; j++) {
1259:           v = a->a + ik*bs2 + j*bs;
1260:           for (k1=0; k1<bs; k1++) {
1261:             sum[j] += PetscAbsScalar(*v); v++;
1262:           }
1263:         }
1264:         /* update il, jl */
1265:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1266:         jmax = a->i[i+1];
1267:         if (jmin < jmax) {
1268:           il[i] = jmin;
1269:           j     = a->j[jmin];
1270:           jl[i] = jl[j]; jl[j]=i;
1271:         }
1272:         i = nexti;
1273:       }
1274:       /*-- row sum --*/
1275:       jmin = a->i[k]; jmax = a->i[k+1];
1276:       for (i=jmin; i<jmax; i++) {
1277:         for (j=0; j<bs; j++) {
1278:           v = a->a + i*bs2 + j;
1279:           for (k1=0; k1<bs; k1++) {
1280:             sum[j] += PetscAbsScalar(*v); v += bs;
1281:           }
1282:         }
1283:       }
1284:       /* add k_th block row to il, jl */
1285:       col = aj+jmin;
1286:       if (*col == k) jmin++;
1287:       if (jmin < jmax) {
1288:         il[k] = jmin;
1289:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1290:       }
1291:       for (j=0; j<bs; j++) {
1292:         if (sum[j] > *norm) *norm = sum[j];
1293:       }
1294:     }
1295:     PetscFree3(sum,il,jl);
1296:     PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1297:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1298:   return(0);
1299: }

1301: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1302: {
1303:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1307:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1308:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1309:     *flg = PETSC_FALSE;
1310:     return(0);
1311:   }

1313:   /* if the a->i are the same */
1314:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1315:   if (!*flg) return(0);

1317:   /* if a->j are the same */
1318:   PetscArraycmp(a->j,b->j,a->nz,flg);
1319:   if (!*flg) return(0);

1321:   /* if a->a are the same */
1322:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1323:   return(0);
1324: }

1326: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1327: {
1328:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1329:   PetscErrorCode  ierr;
1330:   PetscInt        i,j,k,row,bs,ambs,bs2;
1331:   const PetscInt  *ai,*aj;
1332:   PetscScalar     *x,zero = 0.0;
1333:   const MatScalar *aa,*aa_j;

1336:   bs = A->rmap->bs;
1337:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");

1339:   aa   = a->a;
1340:   ambs = a->mbs;

1342:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1343:     PetscInt *diag=a->diag;
1344:     aa   = a->a;
1345:     ambs = a->mbs;
1346:     VecGetArray(v,&x);
1347:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1348:     VecRestoreArray(v,&x);
1349:     return(0);
1350:   }

1352:   ai   = a->i;
1353:   aj   = a->j;
1354:   bs2  = a->bs2;
1355:   VecSet(v,zero);
1356:   if (!a->nz) return(0);
1357:   VecGetArray(v,&x);
1358:   for (i=0; i<ambs; i++) {
1359:     j=ai[i];
1360:     if (aj[j] == i) {    /* if this is a diagonal element */
1361:       row  = i*bs;
1362:       aa_j = aa + j*bs2;
1363:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1364:     }
1365:   }
1366:   VecRestoreArray(v,&x);
1367:   return(0);
1368: }

1370: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1371: {
1372:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1373:   PetscScalar       x;
1374:   const PetscScalar *l,*li,*ri;
1375:   MatScalar         *aa,*v;
1376:   PetscErrorCode    ierr;
1377:   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1378:   const PetscInt    *ai,*aj;
1379:   PetscBool         flg;

1382:   if (ll != rr) {
1383:     VecEqual(ll,rr,&flg);
1384:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1385:   }
1386:   if (!ll) return(0);
1387:   ai  = a->i;
1388:   aj  = a->j;
1389:   aa  = a->a;
1390:   m   = A->rmap->N;
1391:   bs  = A->rmap->bs;
1392:   mbs = a->mbs;
1393:   bs2 = a->bs2;

1395:   VecGetArrayRead(ll,&l);
1396:   VecGetLocalSize(ll,&lm);
1397:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1398:   for (i=0; i<mbs; i++) { /* for each block row */
1399:     M  = ai[i+1] - ai[i];
1400:     li = l + i*bs;
1401:     v  = aa + bs2*ai[i];
1402:     for (j=0; j<M; j++) { /* for each block */
1403:       ri = l + bs*aj[ai[i]+j];
1404:       for (k=0; k<bs; k++) {
1405:         x = ri[k];
1406:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1407:       }
1408:     }
1409:   }
1410:   VecRestoreArrayRead(ll,&l);
1411:   PetscLogFlops(2.0*a->nz);
1412:   return(0);
1413: }

1415: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1416: {
1417:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1420:   info->block_size   = a->bs2;
1421:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1422:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1423:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1424:   info->assemblies   = A->num_ass;
1425:   info->mallocs      = A->info.mallocs;
1426:   info->memory       = ((PetscObject)A)->mem;
1427:   if (A->factortype) {
1428:     info->fill_ratio_given  = A->info.fill_ratio_given;
1429:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1430:     info->factor_mallocs    = A->info.factor_mallocs;
1431:   } else {
1432:     info->fill_ratio_given  = 0;
1433:     info->fill_ratio_needed = 0;
1434:     info->factor_mallocs    = 0;
1435:   }
1436:   return(0);
1437: }

1439: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1440: {
1441:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1445:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1446:   return(0);
1447: }

1449: /*
1450:    This code does not work since it only checks the upper triangular part of
1451:   the matrix. Hence it is not listed in the function table.
1452: */
1453: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1454: {
1455:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1456:   PetscErrorCode  ierr;
1457:   PetscInt        i,j,n,row,col,bs,mbs;
1458:   const PetscInt  *ai,*aj;
1459:   PetscReal       atmp;
1460:   const MatScalar *aa;
1461:   PetscScalar     *x;
1462:   PetscInt        ncols,brow,bcol,krow,kcol;

1465:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1466:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1467:   bs  = A->rmap->bs;
1468:   aa  = a->a;
1469:   ai  = a->i;
1470:   aj  = a->j;
1471:   mbs = a->mbs;

1473:   VecSet(v,0.0);
1474:   VecGetArray(v,&x);
1475:   VecGetLocalSize(v,&n);
1476:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1477:   for (i=0; i<mbs; i++) {
1478:     ncols = ai[1] - ai[0]; ai++;
1479:     brow  = bs*i;
1480:     for (j=0; j<ncols; j++) {
1481:       bcol = bs*(*aj);
1482:       for (kcol=0; kcol<bs; kcol++) {
1483:         col = bcol + kcol;      /* col index */
1484:         for (krow=0; krow<bs; krow++) {
1485:           atmp = PetscAbsScalar(*aa); aa++;
1486:           row  = brow + krow;   /* row index */
1487:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1488:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1489:         }
1490:       }
1491:       aj++;
1492:     }
1493:   }
1494:   VecRestoreArray(v,&x);
1495:   return(0);
1496: }

1498: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1499: {

1503:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1504:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1505:   return(0);
1506: }

1508: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1509: {
1510:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1511:   PetscScalar       *z = c;
1512:   const PetscScalar *xb;
1513:   PetscScalar       x1;
1514:   const MatScalar   *v = a->a,*vv;
1515:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1516: #if defined(PETSC_USE_COMPLEX)
1517:   const int         aconj = A->hermitian;
1518: #else
1519:   const int         aconj = 0;
1520: #endif

1523:   for (i=0; i<mbs; i++) {
1524:     n = ii[1] - ii[0]; ii++;
1525:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1526:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1527:     jj = idx;
1528:     vv = v;
1529:     for (k=0; k<cn; k++) {
1530:       idx = jj;
1531:       v = vv;
1532:       for (j=0; j<n; j++) {
1533:         xb = b + (*idx); x1 = xb[0+k*bm];
1534:         z[0+k*cm] += v[0]*x1;
1535:         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1536:         v += 1;
1537:         ++idx;
1538:       }
1539:     }
1540:     z += 1;
1541:   }
1542:   return(0);
1543: }

1545: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1546: {
1547:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1548:   PetscScalar       *z = c;
1549:   const PetscScalar *xb;
1550:   PetscScalar       x1,x2;
1551:   const MatScalar   *v = a->a,*vv;
1552:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1555:   for (i=0; i<mbs; i++) {
1556:     n = ii[1] - ii[0]; ii++;
1557:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1558:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1559:     jj = idx;
1560:     vv = v;
1561:     for (k=0; k<cn; k++) {
1562:       idx = jj;
1563:       v = vv;
1564:       for (j=0; j<n; j++) {
1565:         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1566:         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1567:         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1568:         if (*idx != i) {
1569:           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1570:           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1571:         }
1572:         v += 4;
1573:         ++idx;
1574:       }
1575:     }
1576:     z += 2;
1577:   }
1578:   return(0);
1579: }

1581: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1582: {
1583:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1584:   PetscScalar       *z = c;
1585:   const PetscScalar *xb;
1586:   PetscScalar       x1,x2,x3;
1587:   const MatScalar   *v = a->a,*vv;
1588:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1591:   for (i=0; i<mbs; i++) {
1592:     n = ii[1] - ii[0]; ii++;
1593:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1594:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1595:     jj = idx;
1596:     vv = v;
1597:     for (k=0; k<cn; k++) {
1598:       idx = jj;
1599:       v = vv;
1600:       for (j=0; j<n; j++) {
1601:         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1602:         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1603:         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1604:         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1605:         if (*idx != i) {
1606:           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1607:           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1608:           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1609:         }
1610:         v += 9;
1611:         ++idx;
1612:       }
1613:     }
1614:     z += 3;
1615:   }
1616:   return(0);
1617: }

1619: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1620: {
1621:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1622:   PetscScalar       *z = c;
1623:   const PetscScalar *xb;
1624:   PetscScalar       x1,x2,x3,x4;
1625:   const MatScalar   *v = a->a,*vv;
1626:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1629:   for (i=0; i<mbs; i++) {
1630:     n = ii[1] - ii[0]; ii++;
1631:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1632:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1633:     jj = idx;
1634:     vv = v;
1635:     for (k=0; k<cn; k++) {
1636:       idx = jj;
1637:       v = vv;
1638:       for (j=0; j<n; j++) {
1639:         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1640:         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1641:         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1642:         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1643:         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1644:         if (*idx != i) {
1645:           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1646:           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1647:           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1648:           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1649:         }
1650:         v += 16;
1651:         ++idx;
1652:       }
1653:     }
1654:     z += 4;
1655:   }
1656:   return(0);
1657: }

1659: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1660: {
1661:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1662:   PetscScalar       *z = c;
1663:   const PetscScalar *xb;
1664:   PetscScalar       x1,x2,x3,x4,x5;
1665:   const MatScalar   *v = a->a,*vv;
1666:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1669:   for (i=0; i<mbs; i++) {
1670:     n = ii[1] - ii[0]; ii++;
1671:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1672:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1673:     jj = idx;
1674:     vv = v;
1675:     for (k=0; k<cn; k++) {
1676:       idx = jj;
1677:       v = vv;
1678:       for (j=0; j<n; j++) {
1679:         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1680:         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1681:         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1682:         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1683:         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1684:         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1685:         if (*idx != i) {
1686:           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1687:           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1688:           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1689:           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1690:           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1691:         }
1692:         v += 25;
1693:         ++idx;
1694:       }
1695:     }
1696:     z += 5;
1697:   }
1698:   return(0);
1699: }

1701: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1702: {
1703:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1704:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1705:   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
1706:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1707:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1708:   PetscBLASInt      bbs,bcn,bbm,bcm;
1709:   PetscScalar       *z = 0;
1710:   PetscScalar       *c,*b;
1711:   const MatScalar   *v;
1712:   const PetscInt    *idx,*ii;
1713:   PetscScalar       _DOne=1.0;
1714:   PetscErrorCode    ierr;

1717:   if (!cm || !cn) return(0);
1718:   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1719:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1720:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1721:   b = bd->v;
1722:   MatZeroEntries(C);
1723:   MatDenseGetArray(C,&c);
1724:   switch (bs) {
1725:   case 1:
1726:     MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1727:     break;
1728:   case 2:
1729:     MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1730:     break;
1731:   case 3:
1732:     MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1733:     break;
1734:   case 4:
1735:     MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1736:     break;
1737:   case 5:
1738:     MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1739:     break;
1740:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1741:     PetscBLASIntCast(bs,&bbs);
1742:     PetscBLASIntCast(cn,&bcn);
1743:     PetscBLASIntCast(bm,&bbm);
1744:     PetscBLASIntCast(cm,&bcm);
1745:     idx = a->j;
1746:     v   = a->a;
1747:     mbs = a->mbs;
1748:     ii  = a->i;
1749:     z   = c;
1750:     for (i=0; i<mbs; i++) {
1751:       n = ii[1] - ii[0]; ii++;
1752:       for (j=0; j<n; j++) {
1753:         if (*idx != i)
1754:           PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1755:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1756:         v += bs2;
1757:       }
1758:       z += bs;
1759:     }
1760:   }
1761:   MatDenseRestoreArray(C,&c);
1762:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1763:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1764:   PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);
1765:   return(0);
1766: }