Actual source code: sbaij2.c

petsc-3.12.5 2020-03-29
Report Typos and Errors

  2:  #include <../src/mat/impls/baij/seq/baij.h>
  3:  #include <petsc/private/kernels/blockinvert.h>
  4:  #include <petscbt.h>
  5:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6:  #include <petscblaslapack.h>

  8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
  9: {
 10:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 12:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 13:   const PetscInt *idx;
 14:   PetscBT        table_out,table_in;

 17:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 18:   mbs  = a->mbs;
 19:   ai   = a->i;
 20:   aj   = a->j;
 21:   bs   = A->rmap->bs;
 22:   PetscBTCreate(mbs,&table_out);
 23:   PetscMalloc1(mbs+1,&nidx);
 24:   PetscMalloc1(A->rmap->N+1,&nidx2);
 25:   PetscBTCreate(mbs,&table_in);

 27:   for (i=0; i<is_max; i++) { /* for each is */
 28:     isz  = 0;
 29:     PetscBTMemzero(mbs,table_out);

 31:     /* Extract the indices, assume there can be duplicate entries */
 32:     ISGetIndices(is[i],&idx);
 33:     ISGetLocalSize(is[i],&n);

 35:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 36:     bcol_max = 0;
 37:     for (j=0; j<n; ++j) {
 38:       brow = idx[j]/bs; /* convert the indices into block indices */
 39:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 40:       if (!PetscBTLookupSet(table_out,brow)) {
 41:         nidx[isz++] = brow;
 42:         if (bcol_max < brow) bcol_max = brow;
 43:       }
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(&is[i]);

 48:     k = 0;
 49:     for (j=0; j<ov; j++) { /* for each overlap */
 50:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 51:       PetscBTMemzero(mbs,table_in);
 52:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 54:       n = isz;  /* length of the updated is[i] */
 55:       for (brow=0; brow<mbs; brow++) {
 56:         start = ai[brow]; end   = ai[brow+1];
 57:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 58:           for (l = start; l<end; l++) {
 59:             bcol = aj[l];
 60:             if (!PetscBTLookupSet(table_out,bcol)) {
 61:               nidx[isz++] = bcol;
 62:               if (bcol_max < bcol) bcol_max = bcol;
 63:             }
 64:           }
 65:           k++;
 66:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 67:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 68:           for (l = start; l<end; l++) {
 69:             bcol = aj[l];
 70:             if (bcol > bcol_max) break;
 71:             if (PetscBTLookup(table_in,bcol)) {
 72:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 73:               break; /* for l = start; l<end ; l++) */
 74:             }
 75:           }
 76:         }
 77:       }
 78:     } /* for each overlap */

 80:     /* expand the Index Set */
 81:     for (j=0; j<isz; j++) {
 82:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 83:     }
 84:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 85:   }
 86:   PetscBTDestroy(&table_out);
 87:   PetscFree(nidx);
 88:   PetscFree(nidx2);
 89:   PetscBTDestroy(&table_in);
 90:   return(0);
 91: }

 93: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 94:         Zero some ops' to avoid invalid usse */
 95: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 96: {

100:   MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
101:   Bseq->ops->mult                   = 0;
102:   Bseq->ops->multadd                = 0;
103:   Bseq->ops->multtranspose          = 0;
104:   Bseq->ops->multtransposeadd       = 0;
105:   Bseq->ops->lufactor               = 0;
106:   Bseq->ops->choleskyfactor         = 0;
107:   Bseq->ops->lufactorsymbolic       = 0;
108:   Bseq->ops->choleskyfactorsymbolic = 0;
109:   Bseq->ops->getinertia             = 0;
110:   return(0);
111: }

113: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
114: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115: {
116:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
120:   const PetscInt *irow,*icol;
121:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122:   PetscInt       *aj = a->j,*ai = a->i;
123:   MatScalar      *mat_a;
124:   Mat            C;
125:   PetscBool      flag;


129:   ISGetIndices(isrow,&irow);
130:   ISGetIndices(iscol,&icol);
131:   ISGetLocalSize(isrow,&nrows);
132:   ISGetLocalSize(iscol,&ncols);

134:   PetscCalloc1(1+oldcols,&smap);
135:   ssmap = smap;
136:   PetscMalloc1(1+nrows,&lens);
137:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138:   /* determine lens of each row */
139:   for (i=0; i<nrows; i++) {
140:     kstart  = ai[irow[i]];
141:     kend    = kstart + a->ilen[irow[i]];
142:     lens[i] = 0;
143:     for (k=kstart; k<kend; k++) {
144:       if (ssmap[aj[k]]) lens[i]++;
145:     }
146:   }
147:   /* Create and fill new matrix */
148:   if (scall == MAT_REUSE_MATRIX) {
149:     c = (Mat_SeqSBAIJ*)((*B)->data);

151:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
153:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154:     PetscArrayzero(c->ilen,c->mbs);
155:     C    = *B;
156:   } else {
157:     MatCreate(PetscObjectComm((PetscObject)A),&C);
158:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
159:     MatSetType(C,((PetscObject)A)->type_name);
160:     MatSeqSBAIJSetPreallocation(C,bs,0,lens);
161:   }
162:   c = (Mat_SeqSBAIJ*)(C->data);
163:   for (i=0; i<nrows; i++) {
164:     row      = irow[i];
165:     kstart   = ai[row];
166:     kend     = kstart + a->ilen[row];
167:     mat_i    = c->i[i];
168:     mat_j    = c->j + mat_i;
169:     mat_a    = c->a + mat_i*bs2;
170:     mat_ilen = c->ilen + i;
171:     for (k=kstart; k<kend; k++) {
172:       if ((tcol=ssmap[a->j[k]])) {
173:         *mat_j++ = tcol - 1;
174:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
175:         mat_a   += bs2;
176:         (*mat_ilen)++;
177:       }
178:     }
179:   }
180:   /* sort */
181:   {
182:     MatScalar *work;

184:     PetscMalloc1(bs2,&work);
185:     for (i=0; i<nrows; i++) {
186:       PetscInt ilen;
187:       mat_i = c->i[i];
188:       mat_j = c->j + mat_i;
189:       mat_a = c->a + mat_i*bs2;
190:       ilen  = c->ilen[i];
191:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
192:     }
193:     PetscFree(work);
194:   }

196:   /* Free work space */
197:   ISRestoreIndices(iscol,&icol);
198:   PetscFree(smap);
199:   PetscFree(lens);
200:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
201:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

203:   ISRestoreIndices(isrow,&irow);
204:   *B   = C;
205:   return(0);
206: }

208: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
209: {
210:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
211:   IS             is1,is2;
213:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214:   const PetscInt *irow,*icol;

217:   ISGetIndices(isrow,&irow);
218:   ISGetIndices(iscol,&icol);
219:   ISGetLocalSize(isrow,&nrows);
220:   ISGetLocalSize(iscol,&ncols);

222:   /* Verify if the indices corespond to each element in a block
223:    and form the IS with compressed IS */
224:   maxmnbs = PetscMax(a->mbs,a->nbs);
225:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
226:   PetscArrayzero(vary,a->mbs);
227:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228:   for (i=0; i<a->mbs; i++) {
229:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230:   }
231:   count = 0;
232:   for (i=0; i<nrows; i++) {
233:     PetscInt j = irow[i] / bs;
234:     if ((vary[j]--)==bs) iary[count++] = j;
235:   }
236:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

238:   PetscArrayzero(vary,a->nbs);
239:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240:   for (i=0; i<a->nbs; i++) {
241:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242:   }
243:   count = 0;
244:   for (i=0; i<ncols; i++) {
245:     PetscInt j = icol[i] / bs;
246:     if ((vary[j]--)==bs) iary[count++] = j;
247:   }
248:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
249:   ISRestoreIndices(isrow,&irow);
250:   ISRestoreIndices(iscol,&icol);
251:   PetscFree2(vary,iary);

253:   MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
254:   ISDestroy(&is1);
255:   ISDestroy(&is2);

257:   if (isrow != iscol) {
258:     PetscBool isequal;
259:     ISEqual(isrow,iscol,&isequal);
260:     if (!isequal) {
261:       MatSeqSBAIJZeroOps_Private(*B);
262:     }
263:   }
264:   return(0);
265: }

267: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
268: {
270:   PetscInt       i;

273:   if (scall == MAT_INITIAL_MATRIX) {
274:     PetscCalloc1(n+1,B);
275:   }

277:   for (i=0; i<n; i++) {
278:     MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
279:   }
280:   return(0);
281: }

283: /* -------------------------------------------------------*/
284: /* Should check that shapes of vectors and matrices match */
285: /* -------------------------------------------------------*/

287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288: {
289:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
290:   PetscScalar       *z,x1,x2,zero=0.0;
291:   const PetscScalar *x,*xb;
292:   const MatScalar   *v;
293:   PetscErrorCode    ierr;
294:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
295:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
296:   PetscInt          nonzerorow=0;

299:   VecSet(zz,zero);
300:   if (!a->nz) return(0);
301:   VecGetArrayRead(xx,&x);
302:   VecGetArray(zz,&z);

304:   v  = a->a;
305:   xb = x;

307:   for (i=0; i<mbs; i++) {
308:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
309:     x1          = xb[0]; x2 = xb[1];
310:     ib          = aj + *ai;
311:     jmin        = 0;
312:     nonzerorow += (n>0);
313:     if (*ib == i) {     /* (diag of A)*x */
314:       z[2*i]   += v[0]*x1 + v[2]*x2;
315:       z[2*i+1] += v[2]*x1 + v[3]*x2;
316:       v        += 4; jmin++;
317:     }
318:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
319:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
320:     for (j=jmin; j<n; j++) {
321:       /* (strict lower triangular part of A)*x  */
322:       cval       = ib[j]*2;
323:       z[cval]   += v[0]*x1 + v[1]*x2;
324:       z[cval+1] += v[2]*x1 + v[3]*x2;
325:       /* (strict upper triangular part of A)*x  */
326:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
327:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
328:       v        += 4;
329:     }
330:     xb +=2; ai++;
331:   }

333:   VecRestoreArrayRead(xx,&x);
334:   VecRestoreArray(zz,&z);
335:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
336:   return(0);
337: }

339: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
340: {
341:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
342:   PetscScalar       *z,x1,x2,x3,zero=0.0;
343:   const PetscScalar *x,*xb;
344:   const MatScalar   *v;
345:   PetscErrorCode    ierr;
346:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
347:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
348:   PetscInt          nonzerorow=0;

351:   VecSet(zz,zero);
352:   if (!a->nz) return(0);
353:   VecGetArrayRead(xx,&x);
354:   VecGetArray(zz,&z);

356:   v  = a->a;
357:   xb = x;

359:   for (i=0; i<mbs; i++) {
360:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
361:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
362:     ib          = aj + *ai;
363:     jmin        = 0;
364:     nonzerorow += (n>0);
365:     if (*ib == i) {     /* (diag of A)*x */
366:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
367:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
368:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
369:       v        += 9; jmin++;
370:     }
371:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
373:     for (j=jmin; j<n; j++) {
374:       /* (strict lower triangular part of A)*x  */
375:       cval       = ib[j]*3;
376:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
377:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
378:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
379:       /* (strict upper triangular part of A)*x  */
380:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
381:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
382:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
383:       v        += 9;
384:     }
385:     xb +=3; ai++;
386:   }

388:   VecRestoreArrayRead(xx,&x);
389:   VecRestoreArray(zz,&z);
390:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
391:   return(0);
392: }

394: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
395: {
396:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
397:   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
398:   const PetscScalar *x,*xb;
399:   const MatScalar   *v;
400:   PetscErrorCode    ierr;
401:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
402:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
403:   PetscInt          nonzerorow = 0;

406:   VecSet(zz,zero);
407:   if (!a->nz) return(0);
408:   VecGetArrayRead(xx,&x);
409:   VecGetArray(zz,&z);

411:   v  = a->a;
412:   xb = x;

414:   for (i=0; i<mbs; i++) {
415:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
416:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417:     ib          = aj + *ai;
418:     jmin        = 0;
419:     nonzerorow += (n>0);
420:     if (*ib == i) {     /* (diag of A)*x */
421:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
422:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
423:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
424:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
425:       v        += 16; jmin++;
426:     }
427:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
428:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429:     for (j=jmin; j<n; j++) {
430:       /* (strict lower triangular part of A)*x  */
431:       cval       = ib[j]*4;
432:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
433:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
434:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
435:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
436:       /* (strict upper triangular part of A)*x  */
437:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
438:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
439:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
440:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
441:       v        += 16;
442:     }
443:     xb +=4; ai++;
444:   }

446:   VecRestoreArrayRead(xx,&x);
447:   VecRestoreArray(zz,&z);
448:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
449:   return(0);
450: }

452: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
453: {
454:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
455:   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
456:   const PetscScalar *x,*xb;
457:   const MatScalar   *v;
458:   PetscErrorCode    ierr;
459:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
460:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
461:   PetscInt          nonzerorow=0;

464:   VecSet(zz,zero);
465:   if (!a->nz) return(0);
466:   VecGetArrayRead(xx,&x);
467:   VecGetArray(zz,&z);

469:   v  = a->a;
470:   xb = x;

472:   for (i=0; i<mbs; i++) {
473:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
474:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
475:     ib          = aj + *ai;
476:     jmin        = 0;
477:     nonzerorow += (n>0);
478:     if (*ib == i) {      /* (diag of A)*x */
479:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
480:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
481:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
482:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
483:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
484:       v        += 25; jmin++;
485:     }
486:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
487:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488:     for (j=jmin; j<n; j++) {
489:       /* (strict lower triangular part of A)*x  */
490:       cval       = ib[j]*5;
491:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
492:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
493:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
494:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
495:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
496:       /* (strict upper triangular part of A)*x  */
497:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
498:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
499:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
500:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
501:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
502:       v        += 25;
503:     }
504:     xb +=5; ai++;
505:   }

507:   VecRestoreArrayRead(xx,&x);
508:   VecRestoreArray(zz,&z);
509:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
510:   return(0);
511: }


514: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518:   const PetscScalar *x,*xb;
519:   const MatScalar   *v;
520:   PetscErrorCode    ierr;
521:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523:   PetscInt          nonzerorow=0;

526:   VecSet(zz,zero);
527:   if (!a->nz) return(0);
528:   VecGetArrayRead(xx,&x);
529:   VecGetArray(zz,&z);

531:   v  = a->a;
532:   xb = x;

534:   for (i=0; i<mbs; i++) {
535:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537:     ib          = aj + *ai;
538:     jmin        = 0;
539:     nonzerorow += (n>0);
540:     if (*ib == i) {      /* (diag of A)*x */
541:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547:       v        += 36; jmin++;
548:     }
549:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551:     for (j=jmin; j<n; j++) {
552:       /* (strict lower triangular part of A)*x  */
553:       cval       = ib[j]*6;
554:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560:       /* (strict upper triangular part of A)*x  */
561:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567:       v        += 36;
568:     }
569:     xb +=6; ai++;
570:   }

572:   VecRestoreArrayRead(xx,&x);
573:   VecRestoreArray(zz,&z);
574:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
575:   return(0);
576: }
577: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
578: {
579:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
580:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
581:   const PetscScalar *x,*xb;
582:   const MatScalar   *v;
583:   PetscErrorCode    ierr;
584:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
585:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
586:   PetscInt          nonzerorow=0;

589:   VecSet(zz,zero);
590:   if (!a->nz) return(0);
591:   VecGetArrayRead(xx,&x);
592:   VecGetArray(zz,&z);

594:   v  = a->a;
595:   xb = x;

597:   for (i=0; i<mbs; i++) {
598:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
599:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
600:     ib          = aj + *ai;
601:     jmin        = 0;
602:     nonzerorow += (n>0);
603:     if (*ib == i) {      /* (diag of A)*x */
604:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
605:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
606:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
607:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
608:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
609:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
610:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
611:       v        += 49; jmin++;
612:     }
613:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
614:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
615:     for (j=jmin; j<n; j++) {
616:       /* (strict lower triangular part of A)*x  */
617:       cval       = ib[j]*7;
618:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
619:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
620:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
621:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
622:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
623:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
624:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
625:       /* (strict upper triangular part of A)*x  */
626:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
627:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
628:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
629:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
630:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
631:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
632:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
633:       v       += 49;
634:     }
635:     xb +=7; ai++;
636:   }
637:   VecRestoreArrayRead(xx,&x);
638:   VecRestoreArray(zz,&z);
639:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
640:   return(0);
641: }

643: /*
644:     This will not work with MatScalar == float because it calls the BLAS
645: */
646: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
647: {
648:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
649:   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
650:   const PetscScalar *x,*x_ptr,*xb;
651:   const MatScalar   *v;
652:   PetscErrorCode    ierr;
653:   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
654:   const PetscInt    *idx,*aj,*ii;
655:   PetscInt          nonzerorow=0;

658:   VecSet(zz,zero);
659:   if (!a->nz) return(0);
660:   VecGetArrayRead(xx,&x);x_ptr = x;
661:   VecGetArray(zz,&z); z_ptr=z;

663:   aj = a->j;
664:   v  = a->a;
665:   ii = a->i;

667:   if (!a->mult_work) {
668:     PetscMalloc1(A->rmap->N+1,&a->mult_work);
669:   }
670:   work = a->mult_work;

672:   for (i=0; i<mbs; i++) {
673:     n           = ii[1] - ii[0]; ncols = n*bs;
674:     workt       = work; idx=aj+ii[0];
675:     nonzerorow += (n>0);

677:     /* upper triangular part */
678:     for (j=0; j<n; j++) {
679:       xb = x_ptr + bs*(*idx++);
680:       for (k=0; k<bs; k++) workt[k] = xb[k];
681:       workt += bs;
682:     }
683:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
684:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

686:     /* strict lower triangular part */
687:     idx = aj+ii[0];
688:     if (*idx == i) {
689:       ncols -= bs; v += bs2; idx++; n--;
690:     }

692:     if (ncols > 0) {
693:       workt = work;
694:       PetscArrayzero(workt,ncols);
695:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
696:       for (j=0; j<n; j++) {
697:         zb = z_ptr + bs*(*idx++);
698:         for (k=0; k<bs; k++) zb[k] += workt[k];
699:         workt += bs;
700:       }
701:     }
702:     x += bs; v += n*bs2; z += bs; ii++;
703:   }

705:   VecRestoreArrayRead(xx,&x);
706:   VecRestoreArray(zz,&z);
707:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
708:   return(0);
709: }

711: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
712: {
713:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
714:   PetscScalar       *z,x1;
715:   const PetscScalar *x,*xb;
716:   const MatScalar   *v;
717:   PetscErrorCode    ierr;
718:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
719:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
720:   PetscInt          nonzerorow=0;

723:   VecCopy(yy,zz);
724:   if (!a->nz) return(0);
725:   VecGetArrayRead(xx,&x);
726:   VecGetArray(zz,&z);
727:   v    = a->a;
728:   xb   = x;

730:   for (i=0; i<mbs; i++) {
731:     n           = ai[1] - ai[0]; /* length of i_th row of A */
732:     x1          = xb[0];
733:     ib          = aj + *ai;
734:     jmin        = 0;
735:     nonzerorow += (n>0);
736:     if (*ib == i) {            /* (diag of A)*x */
737:       z[i] += *v++ * x[*ib++]; jmin++;
738:     }
739:     for (j=jmin; j<n; j++) {
740:       cval    = *ib;
741:       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
742:       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
743:     }
744:     xb++; ai++;
745:   }

747:   VecRestoreArrayRead(xx,&x);
748:   VecRestoreArray(zz,&z);

750:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
751:   return(0);
752: }

754: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
755: {
756:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
757:   PetscScalar       *z,x1,x2;
758:   const PetscScalar *x,*xb;
759:   const MatScalar   *v;
760:   PetscErrorCode    ierr;
761:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
762:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
763:   PetscInt          nonzerorow=0;

766:   VecCopy(yy,zz);
767:   if (!a->nz) return(0);
768:   VecGetArrayRead(xx,&x);
769:   VecGetArray(zz,&z);

771:   v  = a->a;
772:   xb = x;

774:   for (i=0; i<mbs; i++) {
775:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
776:     x1          = xb[0]; x2 = xb[1];
777:     ib          = aj + *ai;
778:     jmin        = 0;
779:     nonzerorow += (n>0);
780:     if (*ib == i) {      /* (diag of A)*x */
781:       z[2*i]   += v[0]*x1 + v[2]*x2;
782:       z[2*i+1] += v[2]*x1 + v[3]*x2;
783:       v        += 4; jmin++;
784:     }
785:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
786:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
787:     for (j=jmin; j<n; j++) {
788:       /* (strict lower triangular part of A)*x  */
789:       cval       = ib[j]*2;
790:       z[cval]   += v[0]*x1 + v[1]*x2;
791:       z[cval+1] += v[2]*x1 + v[3]*x2;
792:       /* (strict upper triangular part of A)*x  */
793:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
794:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
795:       v        += 4;
796:     }
797:     xb +=2; ai++;
798:   }
799:   VecRestoreArrayRead(xx,&x);
800:   VecRestoreArray(zz,&z);

802:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
803:   return(0);
804: }

806: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
807: {
808:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
809:   PetscScalar       *z,x1,x2,x3;
810:   const PetscScalar *x,*xb;
811:   const MatScalar   *v;
812:   PetscErrorCode    ierr;
813:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
814:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
815:   PetscInt          nonzerorow=0;

818:   VecCopy(yy,zz);
819:   if (!a->nz) return(0);
820:   VecGetArrayRead(xx,&x);
821:   VecGetArray(zz,&z);

823:   v  = a->a;
824:   xb = x;

826:   for (i=0; i<mbs; i++) {
827:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
828:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
829:     ib          = aj + *ai;
830:     jmin        = 0;
831:     nonzerorow += (n>0);
832:     if (*ib == i) {     /* (diag of A)*x */
833:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
834:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
835:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836:       v        += 9; jmin++;
837:     }
838:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
839:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
840:     for (j=jmin; j<n; j++) {
841:       /* (strict lower triangular part of A)*x  */
842:       cval       = ib[j]*3;
843:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
844:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
845:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
846:       /* (strict upper triangular part of A)*x  */
847:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
848:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
849:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
850:       v        += 9;
851:     }
852:     xb +=3; ai++;
853:   }

855:   VecRestoreArrayRead(xx,&x);
856:   VecRestoreArray(zz,&z);

858:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
859:   return(0);
860: }

862: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863: {
864:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
865:   PetscScalar       *z,x1,x2,x3,x4;
866:   const PetscScalar *x,*xb;
867:   const MatScalar   *v;
868:   PetscErrorCode    ierr;
869:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
870:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
871:   PetscInt          nonzerorow=0;

874:   VecCopy(yy,zz);
875:   if (!a->nz) return(0);
876:   VecGetArrayRead(xx,&x);
877:   VecGetArray(zz,&z);

879:   v  = a->a;
880:   xb = x;

882:   for (i=0; i<mbs; i++) {
883:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
884:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885:     ib          = aj + *ai;
886:     jmin        = 0;
887:     nonzerorow += (n>0);
888:     if (*ib == i) {      /* (diag of A)*x */
889:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
890:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
891:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
892:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893:       v        += 16; jmin++;
894:     }
895:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897:     for (j=jmin; j<n; j++) {
898:       /* (strict lower triangular part of A)*x  */
899:       cval       = ib[j]*4;
900:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
901:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
902:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
903:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
904:       /* (strict upper triangular part of A)*x  */
905:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
906:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
907:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
908:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
909:       v        += 16;
910:     }
911:     xb +=4; ai++;
912:   }

914:   VecRestoreArrayRead(xx,&x);
915:   VecRestoreArray(zz,&z);

917:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
918:   return(0);
919: }

921: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
922: {
923:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
924:   PetscScalar       *z,x1,x2,x3,x4,x5;
925:   const PetscScalar *x,*xb;
926:   const MatScalar   *v;
927:   PetscErrorCode    ierr;
928:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
929:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
930:   PetscInt          nonzerorow=0;

933:   VecCopy(yy,zz);
934:   if (!a->nz) return(0);
935:   VecGetArrayRead(xx,&x);
936:   VecGetArray(zz,&z);

938:   v  = a->a;
939:   xb = x;

941:   for (i=0; i<mbs; i++) {
942:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
943:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
944:     ib          = aj + *ai;
945:     jmin        = 0;
946:     nonzerorow += (n>0);
947:     if (*ib == i) {      /* (diag of A)*x */
948:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
949:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
950:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
951:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
952:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953:       v        += 25; jmin++;
954:     }
955:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
956:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
957:     for (j=jmin; j<n; j++) {
958:       /* (strict lower triangular part of A)*x  */
959:       cval       = ib[j]*5;
960:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
961:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
962:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
963:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
964:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
965:       /* (strict upper triangular part of A)*x  */
966:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
967:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
968:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
969:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
970:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
971:       v        += 25;
972:     }
973:     xb +=5; ai++;
974:   }

976:   VecRestoreArrayRead(xx,&x);
977:   VecRestoreArray(zz,&z);

979:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
980:   return(0);
981: }
982: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
983: {
984:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
985:   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
986:   const PetscScalar *x,*xb;
987:   const MatScalar   *v;
988:   PetscErrorCode    ierr;
989:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
990:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
991:   PetscInt          nonzerorow=0;

994:   VecCopy(yy,zz);
995:   if (!a->nz) return(0);
996:   VecGetArrayRead(xx,&x);
997:   VecGetArray(zz,&z);

999:   v  = a->a;
1000:   xb = x;

1002:   for (i=0; i<mbs; i++) {
1003:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1004:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1005:     ib          = aj + *ai;
1006:     jmin        = 0;
1007:     nonzerorow += (n>0);
1008:     if (*ib == i) {     /* (diag of A)*x */
1009:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1010:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1011:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1012:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1013:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1014:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015:       v        += 36; jmin++;
1016:     }
1017:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019:     for (j=jmin; j<n; j++) {
1020:       /* (strict lower triangular part of A)*x  */
1021:       cval       = ib[j]*6;
1022:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1023:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1024:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1025:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1026:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1027:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1028:       /* (strict upper triangular part of A)*x  */
1029:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1030:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1031:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1032:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1033:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1034:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1035:       v        += 36;
1036:     }
1037:     xb +=6; ai++;
1038:   }

1040:   VecRestoreArrayRead(xx,&x);
1041:   VecRestoreArray(zz,&z);

1043:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1044:   return(0);
1045: }

1047: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1048: {
1049:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1050:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1051:   const PetscScalar *x,*xb;
1052:   const MatScalar   *v;
1053:   PetscErrorCode    ierr;
1054:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1055:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1056:   PetscInt          nonzerorow=0;

1059:   VecCopy(yy,zz);
1060:   if (!a->nz) return(0);
1061:   VecGetArrayRead(xx,&x);
1062:   VecGetArray(zz,&z);

1064:   v  = a->a;
1065:   xb = x;

1067:   for (i=0; i<mbs; i++) {
1068:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1069:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1070:     ib          = aj + *ai;
1071:     jmin        = 0;
1072:     nonzerorow += (n>0);
1073:     if (*ib == i) {     /* (diag of A)*x */
1074:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1075:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1076:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1077:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1078:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1079:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1080:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081:       v        += 49; jmin++;
1082:     }
1083:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1084:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085:     for (j=jmin; j<n; j++) {
1086:       /* (strict lower triangular part of A)*x  */
1087:       cval       = ib[j]*7;
1088:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1089:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1090:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1091:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1092:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1093:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1094:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1095:       /* (strict upper triangular part of A)*x  */
1096:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1097:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1098:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1099:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1100:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1101:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1102:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1103:       v       += 49;
1104:     }
1105:     xb +=7; ai++;
1106:   }

1108:   VecRestoreArrayRead(xx,&x);
1109:   VecRestoreArray(zz,&z);

1111:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1112:   return(0);
1113: }

1115: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1116: {
1117:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1118:   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1119:   const PetscScalar *x,*x_ptr,*xb;
1120:   const MatScalar   *v;
1121:   PetscErrorCode    ierr;
1122:   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1123:   const PetscInt    *idx,*aj,*ii;
1124:   PetscInt          nonzerorow=0;

1127:   VecCopy(yy,zz);
1128:   if (!a->nz) return(0);
1129:   VecGetArrayRead(xx,&x); x_ptr=x;
1130:   VecGetArray(zz,&z); z_ptr=z;

1132:   aj = a->j;
1133:   v  = a->a;
1134:   ii = a->i;

1136:   if (!a->mult_work) {
1137:     PetscMalloc1(A->rmap->n+1,&a->mult_work);
1138:   }
1139:   work = a->mult_work;


1142:   for (i=0; i<mbs; i++) {
1143:     n           = ii[1] - ii[0]; ncols = n*bs;
1144:     workt       = work; idx=aj+ii[0];
1145:     nonzerorow += (n>0);

1147:     /* upper triangular part */
1148:     for (j=0; j<n; j++) {
1149:       xb = x_ptr + bs*(*idx++);
1150:       for (k=0; k<bs; k++) workt[k] = xb[k];
1151:       workt += bs;
1152:     }
1153:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1154:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1156:     /* strict lower triangular part */
1157:     idx = aj+ii[0];
1158:     if (*idx == i) {
1159:       ncols -= bs; v += bs2; idx++; n--;
1160:     }
1161:     if (ncols > 0) {
1162:       workt = work;
1163:       PetscArrayzero(workt,ncols);
1164:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1165:       for (j=0; j<n; j++) {
1166:         zb = z_ptr + bs*(*idx++);
1167:         for (k=0; k<bs; k++) zb[k] += workt[k];
1168:         workt += bs;
1169:       }
1170:     }

1172:     x += bs; v += n*bs2; z += bs; ii++;
1173:   }

1175:   VecRestoreArrayRead(xx,&x);
1176:   VecRestoreArray(zz,&z);

1178:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1179:   return(0);
1180: }

1182: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1183: {
1184:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1185:   PetscScalar    oalpha = alpha;
1187:   PetscBLASInt   one = 1,totalnz;

1190:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1191:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1192:   PetscLogFlops(totalnz);
1193:   return(0);
1194: }

1196: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1197: {
1198:   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1199:   const MatScalar *v       = a->a;
1200:   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1201:   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1202:   PetscErrorCode  ierr;
1203:   const PetscInt  *aj=a->j,*col;

1206:   if (!a->nz) {
1207:     *norm = 0.0;
1208:     return(0);
1209:   }
1210:   if (type == NORM_FROBENIUS) {
1211:     for (k=0; k<mbs; k++) {
1212:       jmin = a->i[k]; jmax = a->i[k+1];
1213:       col  = aj + jmin;
1214:       if (*col == k) {         /* diagonal block */
1215:         for (i=0; i<bs2; i++) {
1216:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1217:         }
1218:         jmin++;
1219:       }
1220:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1221:         for (i=0; i<bs2; i++) {
1222:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1223:         }
1224:       }
1225:     }
1226:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1227:     PetscLogFlops(2*bs2*a->nz);
1228:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1229:     PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1230:     for (i=0; i<mbs; i++) jl[i] = mbs;
1231:     il[0] = 0;

1233:     *norm = 0.0;
1234:     for (k=0; k<mbs; k++) { /* k_th block row */
1235:       for (j=0; j<bs; j++) sum[j]=0.0;
1236:       /*-- col sum --*/
1237:       i = jl[k]; /* first |A(i,k)| to be added */
1238:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1239:                   at step k */
1240:       while (i<mbs) {
1241:         nexti = jl[i];  /* next block row to be added */
1242:         ik    = il[i];  /* block index of A(i,k) in the array a */
1243:         for (j=0; j<bs; j++) {
1244:           v = a->a + ik*bs2 + j*bs;
1245:           for (k1=0; k1<bs; k1++) {
1246:             sum[j] += PetscAbsScalar(*v); v++;
1247:           }
1248:         }
1249:         /* update il, jl */
1250:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1251:         jmax = a->i[i+1];
1252:         if (jmin < jmax) {
1253:           il[i] = jmin;
1254:           j     = a->j[jmin];
1255:           jl[i] = jl[j]; jl[j]=i;
1256:         }
1257:         i = nexti;
1258:       }
1259:       /*-- row sum --*/
1260:       jmin = a->i[k]; jmax = a->i[k+1];
1261:       for (i=jmin; i<jmax; i++) {
1262:         for (j=0; j<bs; j++) {
1263:           v = a->a + i*bs2 + j;
1264:           for (k1=0; k1<bs; k1++) {
1265:             sum[j] += PetscAbsScalar(*v); v += bs;
1266:           }
1267:         }
1268:       }
1269:       /* add k_th block row to il, jl */
1270:       col = aj+jmin;
1271:       if (*col == k) jmin++;
1272:       if (jmin < jmax) {
1273:         il[k] = jmin;
1274:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1275:       }
1276:       for (j=0; j<bs; j++) {
1277:         if (sum[j] > *norm) *norm = sum[j];
1278:       }
1279:     }
1280:     PetscFree3(sum,il,jl);
1281:     PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1282:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1283:   return(0);
1284: }

1286: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1287: {
1288:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1292:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1293:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1294:     *flg = PETSC_FALSE;
1295:     return(0);
1296:   }

1298:   /* if the a->i are the same */
1299:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1300:   if (!*flg) return(0);

1302:   /* if a->j are the same */
1303:   PetscArraycmp(a->j,b->j,a->nz,flg);
1304:   if (!*flg) return(0);

1306:   /* if a->a are the same */
1307:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1308:   return(0);
1309: }

1311: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1312: {
1313:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1314:   PetscErrorCode  ierr;
1315:   PetscInt        i,j,k,row,bs,ambs,bs2;
1316:   const PetscInt  *ai,*aj;
1317:   PetscScalar     *x,zero = 0.0;
1318:   const MatScalar *aa,*aa_j;

1321:   bs = A->rmap->bs;
1322:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");

1324:   aa   = a->a;
1325:   ambs = a->mbs;

1327:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1328:     PetscInt *diag=a->diag;
1329:     aa   = a->a;
1330:     ambs = a->mbs;
1331:     VecGetArray(v,&x);
1332:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1333:     VecRestoreArray(v,&x);
1334:     return(0);
1335:   }

1337:   ai   = a->i;
1338:   aj   = a->j;
1339:   bs2  = a->bs2;
1340:   VecSet(v,zero);
1341:   if (!a->nz) return(0);
1342:   VecGetArray(v,&x);
1343:   for (i=0; i<ambs; i++) {
1344:     j=ai[i];
1345:     if (aj[j] == i) {    /* if this is a diagonal element */
1346:       row  = i*bs;
1347:       aa_j = aa + j*bs2;
1348:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1349:     }
1350:   }
1351:   VecRestoreArray(v,&x);
1352:   return(0);
1353: }

1355: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1356: {
1357:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1358:   PetscScalar       x;
1359:   const PetscScalar *l,*li,*ri;
1360:   MatScalar         *aa,*v;
1361:   PetscErrorCode    ierr;
1362:   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1363:   const PetscInt    *ai,*aj;
1364:   PetscBool         flg;

1367:   if (ll != rr) {
1368:     VecEqual(ll,rr,&flg);
1369:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1370:   }
1371:   if (!ll) return(0);
1372:   ai  = a->i;
1373:   aj  = a->j;
1374:   aa  = a->a;
1375:   m   = A->rmap->N;
1376:   bs  = A->rmap->bs;
1377:   mbs = a->mbs;
1378:   bs2 = a->bs2;

1380:   VecGetArrayRead(ll,&l);
1381:   VecGetLocalSize(ll,&lm);
1382:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1383:   for (i=0; i<mbs; i++) { /* for each block row */
1384:     M  = ai[i+1] - ai[i];
1385:     li = l + i*bs;
1386:     v  = aa + bs2*ai[i];
1387:     for (j=0; j<M; j++) { /* for each block */
1388:       ri = l + bs*aj[ai[i]+j];
1389:       for (k=0; k<bs; k++) {
1390:         x = ri[k];
1391:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1392:       }
1393:     }
1394:   }
1395:   VecRestoreArrayRead(ll,&l);
1396:   PetscLogFlops(2.0*a->nz);
1397:   return(0);
1398: }

1400: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1401: {
1402:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1405:   info->block_size   = a->bs2;
1406:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1407:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1408:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1409:   info->assemblies   = A->num_ass;
1410:   info->mallocs      = A->info.mallocs;
1411:   info->memory       = ((PetscObject)A)->mem;
1412:   if (A->factortype) {
1413:     info->fill_ratio_given  = A->info.fill_ratio_given;
1414:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1415:     info->factor_mallocs    = A->info.factor_mallocs;
1416:   } else {
1417:     info->fill_ratio_given  = 0;
1418:     info->fill_ratio_needed = 0;
1419:     info->factor_mallocs    = 0;
1420:   }
1421:   return(0);
1422: }


1425: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1426: {
1427:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1431:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1432:   return(0);
1433: }

1435: /*
1436:    This code does not work since it only checks the upper triangular part of
1437:   the matrix. Hence it is not listed in the function table.
1438: */
1439: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1440: {
1441:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1442:   PetscErrorCode  ierr;
1443:   PetscInt        i,j,n,row,col,bs,mbs;
1444:   const PetscInt  *ai,*aj;
1445:   PetscReal       atmp;
1446:   const MatScalar *aa;
1447:   PetscScalar     *x;
1448:   PetscInt        ncols,brow,bcol,krow,kcol;

1451:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1452:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1453:   bs  = A->rmap->bs;
1454:   aa  = a->a;
1455:   ai  = a->i;
1456:   aj  = a->j;
1457:   mbs = a->mbs;

1459:   VecSet(v,0.0);
1460:   VecGetArray(v,&x);
1461:   VecGetLocalSize(v,&n);
1462:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1463:   for (i=0; i<mbs; i++) {
1464:     ncols = ai[1] - ai[0]; ai++;
1465:     brow  = bs*i;
1466:     for (j=0; j<ncols; j++) {
1467:       bcol = bs*(*aj);
1468:       for (kcol=0; kcol<bs; kcol++) {
1469:         col = bcol + kcol;      /* col index */
1470:         for (krow=0; krow<bs; krow++) {
1471:           atmp = PetscAbsScalar(*aa); aa++;
1472:           row  = brow + krow;   /* row index */
1473:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1474:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1475:         }
1476:       }
1477:       aj++;
1478:     }
1479:   }
1480:   VecRestoreArray(v,&x);
1481:   return(0);
1482: }