Actual source code: sbaij2.c

petsc-3.7.3 2016-08-01
Report Typos and Errors
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6: #include <petscblaslapack.h>

 10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 14:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 15:   const PetscInt *idx;
 16:   PetscBT        table_out,table_in;

 19:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 20:   mbs  = a->mbs;
 21:   ai   = a->i;
 22:   aj   = a->j;
 23:   bs   = A->rmap->bs;
 24:   PetscBTCreate(mbs,&table_out);
 25:   PetscMalloc1(mbs+1,&nidx);
 26:   PetscMalloc1(A->rmap->N+1,&nidx2);
 27:   PetscBTCreate(mbs,&table_in);

 29:   for (i=0; i<is_max; i++) { /* for each is */
 30:     isz  = 0;
 31:     PetscBTMemzero(mbs,table_out);

 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 38:     bcol_max = 0;
 39:     for (j=0; j<n; ++j) {
 40:       brow = idx[j]/bs; /* convert the indices into block indices */
 41:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if (!PetscBTLookupSet(table_out,brow)) {
 43:         nidx[isz++] = brow;
 44:         if (bcol_max < brow) bcol_max = brow;
 45:       }
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);

 50:     k = 0;
 51:     for (j=0; j<ov; j++) { /* for each overlap */
 52:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 53:       PetscBTMemzero(mbs,table_in);
 54:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 56:       n = isz;  /* length of the updated is[i] */
 57:       for (brow=0; brow<mbs; brow++) {
 58:         start = ai[brow]; end   = ai[brow+1];
 59:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 60:           for (l = start; l<end; l++) {
 61:             bcol = aj[l];
 62:             if (!PetscBTLookupSet(table_out,bcol)) {
 63:               nidx[isz++] = bcol;
 64:               if (bcol_max < bcol) bcol_max = bcol;
 65:             }
 66:           }
 67:           k++;
 68:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 69:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 70:           for (l = start; l<end; l++) {
 71:             bcol = aj[l];
 72:             if (bcol > bcol_max) break;
 73:             if (PetscBTLookup(table_in,bcol)) {
 74:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 75:               break; /* for l = start; l<end ; l++) */
 76:             }
 77:           }
 78:         }
 79:       }
 80:     } /* for each overlap */

 82:     /* expand the Index Set */
 83:     for (j=0; j<isz; j++) {
 84:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 85:     }
 86:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 87:   }
 88:   PetscBTDestroy(&table_out);
 89:   PetscFree(nidx);
 90:   PetscFree(nidx2);
 91:   PetscBTDestroy(&table_in);
 92:   return(0);
 93: }

 95: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 96:         Zero some ops' to avoid invalid usse */
 99: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
100: {

104:   MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
105:   Bseq->ops->mult                   = 0;
106:   Bseq->ops->multadd                = 0;
107:   Bseq->ops->multtranspose          = 0;
108:   Bseq->ops->multtransposeadd       = 0;
109:   Bseq->ops->lufactor               = 0;
110:   Bseq->ops->choleskyfactor         = 0;
111:   Bseq->ops->lufactorsymbolic       = 0;
112:   Bseq->ops->choleskyfactorsymbolic = 0;
113:   Bseq->ops->getinertia             = 0;
114:   return(0);
115: }

117: /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
120: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
121: {
122:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
124:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
125:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
126:   const PetscInt *irow,*icol;
127:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
128:   PetscInt       *aj = a->j,*ai = a->i;
129:   MatScalar      *mat_a;
130:   Mat            C;
131:   PetscBool      flag;


135:   ISGetIndices(isrow,&irow);
136:   ISGetIndices(iscol,&icol);
137:   ISGetLocalSize(isrow,&nrows);
138:   ISGetLocalSize(iscol,&ncols);

140:   PetscCalloc1(1+oldcols,&smap);
141:   ssmap = smap;
142:   PetscMalloc1(1+nrows,&lens);
143:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
144:   /* determine lens of each row */
145:   for (i=0; i<nrows; i++) {
146:     kstart  = ai[irow[i]];
147:     kend    = kstart + a->ilen[irow[i]];
148:     lens[i] = 0;
149:     for (k=kstart; k<kend; k++) {
150:       if (ssmap[aj[k]]) lens[i]++;
151:     }
152:   }
153:   /* Create and fill new matrix */
154:   if (scall == MAT_REUSE_MATRIX) {
155:     c = (Mat_SeqSBAIJ*)((*B)->data);

157:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
158:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
159:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
160:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
161:     C    = *B;
162:   } else {
163:     MatCreate(PetscObjectComm((PetscObject)A),&C);
164:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
165:     MatSetType(C,((PetscObject)A)->type_name);
166:     MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
167:   }
168:   c = (Mat_SeqSBAIJ*)(C->data);
169:   for (i=0; i<nrows; i++) {
170:     row      = irow[i];
171:     kstart   = ai[row];
172:     kend     = kstart + a->ilen[row];
173:     mat_i    = c->i[i];
174:     mat_j    = c->j + mat_i;
175:     mat_a    = c->a + mat_i*bs2;
176:     mat_ilen = c->ilen + i;
177:     for (k=kstart; k<kend; k++) {
178:       if ((tcol=ssmap[a->j[k]])) {
179:         *mat_j++ = tcol - 1;
180:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
181:         mat_a   += bs2;
182:         (*mat_ilen)++;
183:       }
184:     }
185:   }
186:   /* sort */
187:   {
188:     MatScalar *work;

190:     PetscMalloc1(bs2,&work);
191:     for (i=0; i<nrows; i++) {
192:       PetscInt ilen;
193:       mat_i = c->i[i];
194:       mat_j = c->j + mat_i;
195:       mat_a = c->a + mat_i*bs2;
196:       ilen  = c->ilen[i];
197:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
198:     }
199:     PetscFree(work);
200:   }

202:   /* Free work space */
203:   ISRestoreIndices(iscol,&icol);
204:   PetscFree(smap);
205:   PetscFree(lens);
206:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
207:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

209:   ISRestoreIndices(isrow,&irow);
210:   *B   = C;
211:   return(0);
212: }

216: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
217: {
218:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
219:   IS             is1,is2;
221:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
222:   const PetscInt *irow,*icol;

225:   ISGetIndices(isrow,&irow);
226:   ISGetIndices(iscol,&icol);
227:   ISGetLocalSize(isrow,&nrows);
228:   ISGetLocalSize(iscol,&ncols);

230:   /* Verify if the indices corespond to each element in a block
231:    and form the IS with compressed IS */
232:   maxmnbs = PetscMax(a->mbs,a->nbs);
233:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
234:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
235:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
236:   for (i=0; i<a->mbs; i++) {
237:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
238:   }
239:   count = 0;
240:   for (i=0; i<nrows; i++) {
241:     PetscInt j = irow[i] / bs;
242:     if ((vary[j]--)==bs) iary[count++] = j;
243:   }
244:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

246:   PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
247:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
248:   for (i=0; i<a->nbs; i++) {
249:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
250:   }
251:   count = 0;
252:   for (i=0; i<ncols; i++) {
253:     PetscInt j = icol[i] / bs;
254:     if ((vary[j]--)==bs) iary[count++] = j;
255:   }
256:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
257:   ISRestoreIndices(isrow,&irow);
258:   ISRestoreIndices(iscol,&icol);
259:   PetscFree2(vary,iary);

261:   MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
262:   ISDestroy(&is1);
263:   ISDestroy(&is2);

265:   if (isrow != iscol) {
266:     PetscBool isequal;
267:     ISEqual(isrow,iscol,&isequal);
268:     if (!isequal) {
269:       MatSeqSBAIJZeroOps_Private(*B);
270:     }
271:   }
272:   return(0);
273: }

277: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
278: {
280:   PetscInt       i;

283:   if (scall == MAT_INITIAL_MATRIX) {
284:     PetscMalloc1(n+1,B);
285:   }

287:   for (i=0; i<n; i++) {
288:     MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
289:   }
290:   return(0);
291: }

293: /* -------------------------------------------------------*/
294: /* Should check that shapes of vectors and matrices match */
295: /* -------------------------------------------------------*/

299: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
300: {
301:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
302:   PetscScalar       *z,x1,x2,zero=0.0;
303:   const PetscScalar *x,*xb;
304:   const MatScalar   *v;
305:   PetscErrorCode    ierr;
306:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
307:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
308:   PetscInt          nonzerorow=0;

311:   VecSet(zz,zero);
312:   VecGetArrayRead(xx,&x);
313:   VecGetArray(zz,&z);

315:   v  = a->a;
316:   xb = x;

318:   for (i=0; i<mbs; i++) {
319:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
320:     x1          = xb[0]; x2 = xb[1];
321:     ib          = aj + *ai;
322:     jmin        = 0;
323:     nonzerorow += (n>0);
324:     if (*ib == i) {     /* (diag of A)*x */
325:       z[2*i]   += v[0]*x1 + v[2]*x2;
326:       z[2*i+1] += v[2]*x1 + v[3]*x2;
327:       v        += 4; jmin++;
328:     }
329:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
330:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
331:     for (j=jmin; j<n; j++) {
332:       /* (strict lower triangular part of A)*x  */
333:       cval       = ib[j]*2;
334:       z[cval]   += v[0]*x1 + v[1]*x2;
335:       z[cval+1] += v[2]*x1 + v[3]*x2;
336:       /* (strict upper triangular part of A)*x  */
337:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
338:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
339:       v        += 4;
340:     }
341:     xb +=2; ai++;
342:   }

344:   VecRestoreArrayRead(xx,&x);
345:   VecRestoreArray(zz,&z);
346:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
347:   return(0);
348: }

352: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
353: {
354:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
355:   PetscScalar       *z,x1,x2,x3,zero=0.0;
356:   const PetscScalar *x,*xb;
357:   const MatScalar   *v;
358:   PetscErrorCode    ierr;
359:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
360:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
361:   PetscInt          nonzerorow=0;

364:   VecSet(zz,zero);
365:   VecGetArrayRead(xx,&x);
366:   VecGetArray(zz,&z);

368:   v  = a->a;
369:   xb = x;

371:   for (i=0; i<mbs; i++) {
372:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
373:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
374:     ib          = aj + *ai;
375:     jmin        = 0;
376:     nonzerorow += (n>0);
377:     if (*ib == i) {     /* (diag of A)*x */
378:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
379:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
380:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
381:       v        += 9; jmin++;
382:     }
383:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
384:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
385:     for (j=jmin; j<n; j++) {
386:       /* (strict lower triangular part of A)*x  */
387:       cval       = ib[j]*3;
388:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
389:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
390:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
391:       /* (strict upper triangular part of A)*x  */
392:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
393:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
394:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
395:       v        += 9;
396:     }
397:     xb +=3; ai++;
398:   }

400:   VecRestoreArrayRead(xx,&x);
401:   VecRestoreArray(zz,&z);
402:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
403:   return(0);
404: }

408: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
409: {
410:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411:   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
412:   const PetscScalar *x,*xb;
413:   const MatScalar   *v;
414:   PetscErrorCode    ierr;
415:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
417:   PetscInt          nonzerorow = 0;

420:   VecSet(zz,zero);
421:   VecGetArrayRead(xx,&x);
422:   VecGetArray(zz,&z);

424:   v  = a->a;
425:   xb = x;

427:   for (i=0; i<mbs; i++) {
428:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
429:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
430:     ib          = aj + *ai;
431:     jmin        = 0;
432:     nonzerorow += (n>0);
433:     if (*ib == i) {     /* (diag of A)*x */
434:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
435:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
436:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
437:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
438:       v        += 16; jmin++;
439:     }
440:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
441:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
442:     for (j=jmin; j<n; j++) {
443:       /* (strict lower triangular part of A)*x  */
444:       cval       = ib[j]*4;
445:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
446:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
447:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
448:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
449:       /* (strict upper triangular part of A)*x  */
450:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
451:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
452:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
453:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
454:       v        += 16;
455:     }
456:     xb +=4; ai++;
457:   }

459:   VecRestoreArrayRead(xx,&x);
460:   VecRestoreArray(zz,&z);
461:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
462:   return(0);
463: }

467: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
468: {
469:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
470:   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
471:   const PetscScalar *x,*xb;
472:   const MatScalar   *v;
473:   PetscErrorCode    ierr;
474:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
475:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
476:   PetscInt          nonzerorow=0;

479:   VecSet(zz,zero);
480:   VecGetArrayRead(xx,&x);
481:   VecGetArray(zz,&z);

483:   v  = a->a;
484:   xb = x;

486:   for (i=0; i<mbs; i++) {
487:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
488:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
489:     ib          = aj + *ai;
490:     jmin        = 0;
491:     nonzerorow += (n>0);
492:     if (*ib == i) {      /* (diag of A)*x */
493:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
494:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
495:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
496:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
497:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
498:       v        += 25; jmin++;
499:     }
500:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
501:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502:     for (j=jmin; j<n; j++) {
503:       /* (strict lower triangular part of A)*x  */
504:       cval       = ib[j]*5;
505:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
506:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
507:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
508:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
509:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
510:       /* (strict upper triangular part of A)*x  */
511:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
512:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
513:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
514:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
515:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
516:       v        += 25;
517:     }
518:     xb +=5; ai++;
519:   }

521:   VecRestoreArrayRead(xx,&x);
522:   VecRestoreArray(zz,&z);
523:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
524:   return(0);
525: }


530: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
531: {
532:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
533:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
534:   const PetscScalar *x,*xb;
535:   const MatScalar   *v;
536:   PetscErrorCode    ierr;
537:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
538:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
539:   PetscInt          nonzerorow=0;

542:   VecSet(zz,zero);
543:   VecGetArrayRead(xx,&x);
544:   VecGetArray(zz,&z);

546:   v  = a->a;
547:   xb = x;

549:   for (i=0; i<mbs; i++) {
550:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
551:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
552:     ib          = aj + *ai;
553:     jmin        = 0;
554:     nonzerorow += (n>0);
555:     if (*ib == i) {      /* (diag of A)*x */
556:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
557:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
558:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
559:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
560:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
561:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
562:       v        += 36; jmin++;
563:     }
564:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
565:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
566:     for (j=jmin; j<n; j++) {
567:       /* (strict lower triangular part of A)*x  */
568:       cval       = ib[j]*6;
569:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
570:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
571:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
572:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
573:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
574:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
575:       /* (strict upper triangular part of A)*x  */
576:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
577:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
578:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
579:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
580:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
581:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
582:       v        += 36;
583:     }
584:     xb +=6; ai++;
585:   }

587:   VecRestoreArrayRead(xx,&x);
588:   VecRestoreArray(zz,&z);
589:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
590:   return(0);
591: }
594: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
595: {
596:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
597:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
598:   const PetscScalar *x,*xb;
599:   const MatScalar   *v;
600:   PetscErrorCode    ierr;
601:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
602:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
603:   PetscInt          nonzerorow=0;

606:   VecSet(zz,zero);
607:   VecGetArrayRead(xx,&x);
608:   VecGetArray(zz,&z);

610:   v  = a->a;
611:   xb = x;

613:   for (i=0; i<mbs; i++) {
614:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
615:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
616:     ib          = aj + *ai;
617:     jmin        = 0;
618:     nonzerorow += (n>0);
619:     if (*ib == i) {      /* (diag of A)*x */
620:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
621:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
622:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
623:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
624:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
625:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
626:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
627:       v        += 49; jmin++;
628:     }
629:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
630:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
631:     for (j=jmin; j<n; j++) {
632:       /* (strict lower triangular part of A)*x  */
633:       cval       = ib[j]*7;
634:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
635:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
636:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
637:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
638:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
639:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
640:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
641:       /* (strict upper triangular part of A)*x  */
642:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
643:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
644:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
645:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
646:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
647:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
648:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
649:       v       += 49;
650:     }
651:     xb +=7; ai++;
652:   }
653:   VecRestoreArrayRead(xx,&x);
654:   VecRestoreArray(zz,&z);
655:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
656:   return(0);
657: }

659: /*
660:     This will not work with MatScalar == float because it calls the BLAS
661: */
664: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
665: {
666:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
667:   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
668:   const PetscScalar *x,*x_ptr,*xb;
669:   const MatScalar   *v;
670:   PetscErrorCode    ierr;
671:   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
672:   const PetscInt    *idx,*aj,*ii;
673:   PetscInt          nonzerorow=0;

676:   VecSet(zz,zero);
677:   VecGetArrayRead(xx,&x);x_ptr = x;
678:   VecGetArray(zz,&z); z_ptr=z;

680:   aj = a->j;
681:   v  = a->a;
682:   ii = a->i;

684:   if (!a->mult_work) {
685:     PetscMalloc1(A->rmap->N+1,&a->mult_work);
686:   }
687:   work = a->mult_work;

689:   for (i=0; i<mbs; i++) {
690:     n           = ii[1] - ii[0]; ncols = n*bs;
691:     workt       = work; idx=aj+ii[0];
692:     nonzerorow += (n>0);

694:     /* upper triangular part */
695:     for (j=0; j<n; j++) {
696:       xb = x_ptr + bs*(*idx++);
697:       for (k=0; k<bs; k++) workt[k] = xb[k];
698:       workt += bs;
699:     }
700:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
701:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

703:     /* strict lower triangular part */
704:     idx = aj+ii[0];
705:     if (*idx == i) {
706:       ncols -= bs; v += bs2; idx++; n--;
707:     }

709:     if (ncols > 0) {
710:       workt = work;
711:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
712:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
713:       for (j=0; j<n; j++) {
714:         zb = z_ptr + bs*(*idx++);
715:         for (k=0; k<bs; k++) zb[k] += workt[k];
716:         workt += bs;
717:       }
718:     }
719:     x += bs; v += n*bs2; z += bs; ii++;
720:   }

722:   VecRestoreArrayRead(xx,&x);
723:   VecRestoreArray(zz,&z);
724:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
725:   return(0);
726: }

730: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
731: {
732:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
733:   PetscScalar       *z,x1;
734:   const PetscScalar *x,*xb;
735:   const MatScalar   *v;
736:   PetscErrorCode    ierr;
737:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
738:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
739:   PetscInt          nonzerorow=0;

742:   VecCopy(yy,zz);
743:   VecGetArrayRead(xx,&x);
744:   VecGetArray(zz,&z);
745:   v    = a->a;
746:   xb   = x;

748:   for (i=0; i<mbs; i++) {
749:     n           = ai[1] - ai[0]; /* length of i_th row of A */
750:     x1          = xb[0];
751:     ib          = aj + *ai;
752:     jmin        = 0;
753:     nonzerorow += (n>0);
754:     if (*ib == i) {            /* (diag of A)*x */
755:       z[i] += *v++ * x[*ib++]; jmin++;
756:     }
757:     for (j=jmin; j<n; j++) {
758:       cval    = *ib;
759:       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
760:       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
761:     }
762:     xb++; ai++;
763:   }

765:   VecRestoreArrayRead(xx,&x);
766:   VecRestoreArray(zz,&z);

768:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
769:   return(0);
770: }

774: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
775: {
776:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
777:   PetscScalar       *z,x1,x2;
778:   const PetscScalar *x,*xb;
779:   const MatScalar   *v;
780:   PetscErrorCode    ierr;
781:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
782:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
783:   PetscInt          nonzerorow=0;

786:   VecCopy(yy,zz);
787:   VecGetArrayRead(xx,&x);
788:   VecGetArray(zz,&z);

790:   v  = a->a;
791:   xb = x;

793:   for (i=0; i<mbs; i++) {
794:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
795:     x1          = xb[0]; x2 = xb[1];
796:     ib          = aj + *ai;
797:     jmin        = 0;
798:     nonzerorow += (n>0);
799:     if (*ib == i) {      /* (diag of A)*x */
800:       z[2*i]   += v[0]*x1 + v[2]*x2;
801:       z[2*i+1] += v[2]*x1 + v[3]*x2;
802:       v        += 4; jmin++;
803:     }
804:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
805:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
806:     for (j=jmin; j<n; j++) {
807:       /* (strict lower triangular part of A)*x  */
808:       cval       = ib[j]*2;
809:       z[cval]   += v[0]*x1 + v[1]*x2;
810:       z[cval+1] += v[2]*x1 + v[3]*x2;
811:       /* (strict upper triangular part of A)*x  */
812:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
813:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
814:       v        += 4;
815:     }
816:     xb +=2; ai++;
817:   }
818:   VecRestoreArrayRead(xx,&x);
819:   VecRestoreArray(zz,&z);

821:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
822:   return(0);
823: }

827: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828: {
829:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
830:   PetscScalar       *z,x1,x2,x3;
831:   const PetscScalar *x,*xb;
832:   const MatScalar   *v;
833:   PetscErrorCode    ierr;
834:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
835:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
836:   PetscInt          nonzerorow=0;

839:   VecCopy(yy,zz);
840:   VecGetArrayRead(xx,&x);
841:   VecGetArray(zz,&z);

843:   v  = a->a;
844:   xb = x;

846:   for (i=0; i<mbs; i++) {
847:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
848:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
849:     ib          = aj + *ai;
850:     jmin        = 0;
851:     nonzerorow += (n>0);
852:     if (*ib == i) {     /* (diag of A)*x */
853:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
854:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
855:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
856:       v        += 9; jmin++;
857:     }
858:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
860:     for (j=jmin; j<n; j++) {
861:       /* (strict lower triangular part of A)*x  */
862:       cval       = ib[j]*3;
863:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
864:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
865:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
866:       /* (strict upper triangular part of A)*x  */
867:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
868:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
869:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
870:       v        += 9;
871:     }
872:     xb +=3; ai++;
873:   }

875:   VecRestoreArrayRead(xx,&x);
876:   VecRestoreArray(zz,&z);

878:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
879:   return(0);
880: }

884: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
885: {
886:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
887:   PetscScalar       *z,x1,x2,x3,x4;
888:   const PetscScalar *x,*xb;
889:   const MatScalar   *v;
890:   PetscErrorCode    ierr;
891:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
892:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
893:   PetscInt          nonzerorow=0;

896:   VecCopy(yy,zz);
897:   VecGetArrayRead(xx,&x);
898:   VecGetArray(zz,&z);

900:   v  = a->a;
901:   xb = x;

903:   for (i=0; i<mbs; i++) {
904:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
905:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
906:     ib          = aj + *ai;
907:     jmin        = 0;
908:     nonzerorow += (n>0);
909:     if (*ib == i) {      /* (diag of A)*x */
910:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
911:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
912:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
913:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
914:       v        += 16; jmin++;
915:     }
916:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
917:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
918:     for (j=jmin; j<n; j++) {
919:       /* (strict lower triangular part of A)*x  */
920:       cval       = ib[j]*4;
921:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
922:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
923:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
924:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
925:       /* (strict upper triangular part of A)*x  */
926:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
927:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
928:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
929:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
930:       v        += 16;
931:     }
932:     xb +=4; ai++;
933:   }

935:   VecRestoreArrayRead(xx,&x);
936:   VecRestoreArray(zz,&z);

938:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
939:   return(0);
940: }

944: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
945: {
946:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
947:   PetscScalar       *z,x1,x2,x3,x4,x5;
948:   const PetscScalar *x,*xb;
949:   const MatScalar   *v;
950:   PetscErrorCode    ierr;
951:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
952:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
953:   PetscInt          nonzerorow=0;

956:   VecCopy(yy,zz);
957:   VecGetArrayRead(xx,&x);
958:   VecGetArray(zz,&z);

960:   v  = a->a;
961:   xb = x;

963:   for (i=0; i<mbs; i++) {
964:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
965:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
966:     ib          = aj + *ai;
967:     jmin        = 0;
968:     nonzerorow += (n>0);
969:     if (*ib == i) {      /* (diag of A)*x */
970:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
971:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
972:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
973:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
974:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
975:       v        += 25; jmin++;
976:     }
977:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979:     for (j=jmin; j<n; j++) {
980:       /* (strict lower triangular part of A)*x  */
981:       cval       = ib[j]*5;
982:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
983:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
984:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
985:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
986:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
987:       /* (strict upper triangular part of A)*x  */
988:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
989:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
990:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
991:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
992:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
993:       v        += 25;
994:     }
995:     xb +=5; ai++;
996:   }

998:   VecRestoreArrayRead(xx,&x);
999:   VecRestoreArray(zz,&z);

1001:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
1002:   return(0);
1003: }
1006: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1007: {
1008:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1009:   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1010:   const PetscScalar *x,*xb;
1011:   const MatScalar   *v;
1012:   PetscErrorCode    ierr;
1013:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1014:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1015:   PetscInt          nonzerorow=0;

1018:   VecCopy(yy,zz);
1019:   VecGetArrayRead(xx,&x);
1020:   VecGetArray(zz,&z);

1022:   v  = a->a;
1023:   xb = x;

1025:   for (i=0; i<mbs; i++) {
1026:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1027:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1028:     ib          = aj + *ai;
1029:     jmin        = 0;
1030:     nonzerorow += (n>0);
1031:     if (*ib == i) {     /* (diag of A)*x */
1032:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1033:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1034:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1035:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1036:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1037:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1038:       v        += 36; jmin++;
1039:     }
1040:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1041:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1042:     for (j=jmin; j<n; j++) {
1043:       /* (strict lower triangular part of A)*x  */
1044:       cval       = ib[j]*6;
1045:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1046:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1047:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1048:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1049:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1050:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1051:       /* (strict upper triangular part of A)*x  */
1052:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1053:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1054:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1055:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1056:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1057:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1058:       v        += 36;
1059:     }
1060:     xb +=6; ai++;
1061:   }

1063:   VecRestoreArrayRead(xx,&x);
1064:   VecRestoreArray(zz,&z);

1066:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1067:   return(0);
1068: }

1072: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1073: {
1074:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1075:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1076:   const PetscScalar *x,*xb;
1077:   const MatScalar   *v;
1078:   PetscErrorCode    ierr;
1079:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1080:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1081:   PetscInt          nonzerorow=0;

1084:   VecCopy(yy,zz);
1085:   VecGetArrayRead(xx,&x);
1086:   VecGetArray(zz,&z);

1088:   v  = a->a;
1089:   xb = x;

1091:   for (i=0; i<mbs; i++) {
1092:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1093:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1094:     ib          = aj + *ai;
1095:     jmin        = 0;
1096:     nonzerorow += (n>0);
1097:     if (*ib == i) {     /* (diag of A)*x */
1098:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1099:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1100:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1101:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1102:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1103:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1104:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1105:       v        += 49; jmin++;
1106:     }
1107:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1108:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1109:     for (j=jmin; j<n; j++) {
1110:       /* (strict lower triangular part of A)*x  */
1111:       cval       = ib[j]*7;
1112:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1113:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1114:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1115:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1116:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1117:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1118:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1119:       /* (strict upper triangular part of A)*x  */
1120:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1121:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1122:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1123:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1124:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1125:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1126:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1127:       v       += 49;
1128:     }
1129:     xb +=7; ai++;
1130:   }

1132:   VecRestoreArrayRead(xx,&x);
1133:   VecRestoreArray(zz,&z);

1135:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1136:   return(0);
1137: }

1141: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1142: {
1143:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1144:   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1145:   const PetscScalar *x,*x_ptr,*xb;
1146:   const MatScalar   *v;
1147:   PetscErrorCode    ierr;
1148:   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1149:   const PetscInt    *idx,*aj,*ii;
1150:   PetscInt          nonzerorow=0;

1153:   VecCopy(yy,zz);
1154:   VecGetArrayRead(xx,&x); x_ptr=x;
1155:   VecGetArray(zz,&z); z_ptr=z;

1157:   aj = a->j;
1158:   v  = a->a;
1159:   ii = a->i;

1161:   if (!a->mult_work) {
1162:     PetscMalloc1(A->rmap->n+1,&a->mult_work);
1163:   }
1164:   work = a->mult_work;


1167:   for (i=0; i<mbs; i++) {
1168:     n           = ii[1] - ii[0]; ncols = n*bs;
1169:     workt       = work; idx=aj+ii[0];
1170:     nonzerorow += (n>0);

1172:     /* upper triangular part */
1173:     for (j=0; j<n; j++) {
1174:       xb = x_ptr + bs*(*idx++);
1175:       for (k=0; k<bs; k++) workt[k] = xb[k];
1176:       workt += bs;
1177:     }
1178:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1179:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1181:     /* strict lower triangular part */
1182:     idx = aj+ii[0];
1183:     if (*idx == i) {
1184:       ncols -= bs; v += bs2; idx++; n--;
1185:     }
1186:     if (ncols > 0) {
1187:       workt = work;
1188:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
1189:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1190:       for (j=0; j<n; j++) {
1191:         zb = z_ptr + bs*(*idx++);
1192:         for (k=0; k<bs; k++) zb[k] += workt[k];
1193:         workt += bs;
1194:       }
1195:     }

1197:     x += bs; v += n*bs2; z += bs; ii++;
1198:   }

1200:   VecRestoreArrayRead(xx,&x);
1201:   VecRestoreArray(zz,&z);

1203:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1204:   return(0);
1205: }

1209: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1210: {
1211:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1212:   PetscScalar    oalpha = alpha;
1214:   PetscBLASInt   one = 1,totalnz;

1217:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1218:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1219:   PetscLogFlops(totalnz);
1220:   return(0);
1221: }

1225: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1226: {
1227:   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1228:   const MatScalar *v       = a->a;
1229:   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1230:   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1231:   PetscErrorCode  ierr;
1232:   const PetscInt  *aj=a->j,*col;

1235:   if (type == NORM_FROBENIUS) {
1236:     for (k=0; k<mbs; k++) {
1237:       jmin = a->i[k]; jmax = a->i[k+1];
1238:       col  = aj + jmin;
1239:       if (*col == k) {         /* diagonal block */
1240:         for (i=0; i<bs2; i++) {
1241:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242:         }
1243:         jmin++;
1244:       }
1245:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1246:         for (i=0; i<bs2; i++) {
1247:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1248:         }
1249:       }
1250:     }
1251:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1252:     PetscLogFlops(2*bs2*a->nz);
1253:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1254:     PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1255:     for (i=0; i<mbs; i++) jl[i] = mbs;
1256:     il[0] = 0;

1258:     *norm = 0.0;
1259:     for (k=0; k<mbs; k++) { /* k_th block row */
1260:       for (j=0; j<bs; j++) sum[j]=0.0;
1261:       /*-- col sum --*/
1262:       i = jl[k]; /* first |A(i,k)| to be added */
1263:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1264:                   at step k */
1265:       while (i<mbs) {
1266:         nexti = jl[i];  /* next block row to be added */
1267:         ik    = il[i];  /* block index of A(i,k) in the array a */
1268:         for (j=0; j<bs; j++) {
1269:           v = a->a + ik*bs2 + j*bs;
1270:           for (k1=0; k1<bs; k1++) {
1271:             sum[j] += PetscAbsScalar(*v); v++;
1272:           }
1273:         }
1274:         /* update il, jl */
1275:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1276:         jmax = a->i[i+1];
1277:         if (jmin < jmax) {
1278:           il[i] = jmin;
1279:           j     = a->j[jmin];
1280:           jl[i] = jl[j]; jl[j]=i;
1281:         }
1282:         i = nexti;
1283:       }
1284:       /*-- row sum --*/
1285:       jmin = a->i[k]; jmax = a->i[k+1];
1286:       for (i=jmin; i<jmax; i++) {
1287:         for (j=0; j<bs; j++) {
1288:           v = a->a + i*bs2 + j;
1289:           for (k1=0; k1<bs; k1++) {
1290:             sum[j] += PetscAbsScalar(*v); v += bs;
1291:           }
1292:         }
1293:       }
1294:       /* add k_th block row to il, jl */
1295:       col = aj+jmin;
1296:       if (*col == k) jmin++;
1297:       if (jmin < jmax) {
1298:         il[k] = jmin;
1299:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1300:       }
1301:       for (j=0; j<bs; j++) {
1302:         if (sum[j] > *norm) *norm = sum[j];
1303:       }
1304:     }
1305:     PetscFree3(sum,il,jl);
1306:     PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1307:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1308:   return(0);
1309: }

1313: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1314: {
1315:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1319:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1320:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1321:     *flg = PETSC_FALSE;
1322:     return(0);
1323:   }

1325:   /* if the a->i are the same */
1326:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1327:   if (!*flg) return(0);

1329:   /* if a->j are the same */
1330:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1331:   if (!*flg) return(0);

1333:   /* if a->a are the same */
1334:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1335:   return(0);
1336: }

1340: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1341: {
1342:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1343:   PetscErrorCode  ierr;
1344:   PetscInt        i,j,k,row,bs,ambs,bs2;
1345:   const PetscInt  *ai,*aj;
1346:   PetscScalar     *x,zero = 0.0;
1347:   const MatScalar *aa,*aa_j;

1350:   bs = A->rmap->bs;
1351:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");

1353:   aa   = a->a;
1354:   ambs = a->mbs;

1356:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1357:     PetscInt *diag=a->diag;
1358:     aa   = a->a;
1359:     ambs = a->mbs;
1360:     VecGetArray(v,&x);
1361:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1362:     VecRestoreArray(v,&x);
1363:     return(0);
1364:   }

1366:   ai   = a->i;
1367:   aj   = a->j;
1368:   bs2  = a->bs2;
1369:   VecSet(v,zero);
1370:   VecGetArray(v,&x);
1371:   for (i=0; i<ambs; i++) {
1372:     j=ai[i];
1373:     if (aj[j] == i) {    /* if this is a diagonal element */
1374:       row  = i*bs;
1375:       aa_j = aa + j*bs2;
1376:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1377:     }
1378:   }
1379:   VecRestoreArray(v,&x);
1380:   return(0);
1381: }

1385: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1386: {
1387:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1388:   PetscScalar       x;
1389:   const PetscScalar *l,*li,*ri;
1390:   MatScalar         *aa,*v;
1391:   PetscErrorCode    ierr;
1392:   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1393:   PetscBool         flg;

1396:   if (ll != rr) {
1397:     VecEqual(ll,rr,&flg);
1398:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1399:   }
1400:   if (!ll) return(0);
1401:   ai  = a->i;
1402:   aj  = a->j;
1403:   aa  = a->a;
1404:   m   = A->rmap->N;
1405:   bs  = A->rmap->bs;
1406:   mbs = a->mbs;
1407:   bs2 = a->bs2;

1409:   VecGetArrayRead(ll,&l);
1410:   VecGetLocalSize(ll,&lm);
1411:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1412:   for (i=0; i<mbs; i++) { /* for each block row */
1413:     M  = ai[i+1] - ai[i];
1414:     li = l + i*bs;
1415:     v  = aa + bs2*ai[i];
1416:     for (j=0; j<M; j++) { /* for each block */
1417:       ri = l + bs*aj[ai[i]+j];
1418:       for (k=0; k<bs; k++) {
1419:         x = ri[k];
1420:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1421:       }
1422:     }
1423:   }
1424:   VecRestoreArrayRead(ll,&l);
1425:   PetscLogFlops(2.0*a->nz);
1426:   return(0);
1427: }

1431: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1432: {
1433:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1436:   info->block_size   = a->bs2;
1437:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1438:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1439:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1440:   info->assemblies   = A->num_ass;
1441:   info->mallocs      = A->info.mallocs;
1442:   info->memory       = ((PetscObject)A)->mem;
1443:   if (A->factortype) {
1444:     info->fill_ratio_given  = A->info.fill_ratio_given;
1445:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1446:     info->factor_mallocs    = A->info.factor_mallocs;
1447:   } else {
1448:     info->fill_ratio_given  = 0;
1449:     info->fill_ratio_needed = 0;
1450:     info->factor_mallocs    = 0;
1451:   }
1452:   return(0);
1453: }


1458: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1459: {
1460:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1464:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1465:   return(0);
1466: }

1470: /*
1471:    This code does not work since it only checks the upper triangular part of
1472:   the matrix. Hence it is not listed in the function table.
1473: */
1474: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1475: {
1476:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1477:   PetscErrorCode  ierr;
1478:   PetscInt        i,j,n,row,col,bs,mbs;
1479:   const PetscInt  *ai,*aj;
1480:   PetscReal       atmp;
1481:   const MatScalar *aa;
1482:   PetscScalar     *x;
1483:   PetscInt        ncols,brow,bcol,krow,kcol;

1486:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1487:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1488:   bs  = A->rmap->bs;
1489:   aa  = a->a;
1490:   ai  = a->i;
1491:   aj  = a->j;
1492:   mbs = a->mbs;

1494:   VecSet(v,0.0);
1495:   VecGetArray(v,&x);
1496:   VecGetLocalSize(v,&n);
1497:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1498:   for (i=0; i<mbs; i++) {
1499:     ncols = ai[1] - ai[0]; ai++;
1500:     brow  = bs*i;
1501:     for (j=0; j<ncols; j++) {
1502:       bcol = bs*(*aj);
1503:       for (kcol=0; kcol<bs; kcol++) {
1504:         col = bcol + kcol;      /* col index */
1505:         for (krow=0; krow<bs; krow++) {
1506:           atmp = PetscAbsScalar(*aa); aa++;
1507:           row  = brow + krow;   /* row index */
1508:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1509:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1510:         }
1511:       }
1512:       aj++;
1513:     }
1514:   }
1515:   VecRestoreArray(v,&x);
1516:   return(0);
1517: }