Actual source code: sbaij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5: #include <petsc/private/kernels/blockinvert.h>
  6: #include <petscbt.h>
  7: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 12:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 13:   const PetscInt *idx;
 14:   PetscBT        table_out,table_in;

 17:   mbs  = a->mbs;
 18:   ai   = a->i;
 19:   aj   = a->j;
 20:   bs   = A->rmap->bs;
 21:   PetscBTCreate(mbs,&table_out);
 22:   PetscMalloc1(mbs+1,&nidx);
 23:   PetscMalloc1(A->rmap->N+1,&nidx2);
 24:   PetscBTCreate(mbs,&table_in);

 26:   for (i=0; i<is_max; i++) { /* for each is */
 27:     isz  = 0;
 28:     PetscBTMemzero(mbs,table_out);

 30:     /* Extract the indices, assume there can be duplicate entries */
 31:     ISGetIndices(is[i],&idx);
 32:     ISGetLocalSize(is[i],&n);

 34:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 35:     bcol_max = 0;
 36:     for (j=0; j<n; ++j) {
 37:       brow = idx[j]/bs; /* convert the indices into block indices */
 39:       if (!PetscBTLookupSet(table_out,brow)) {
 40:         nidx[isz++] = brow;
 41:         if (bcol_max < brow) bcol_max = brow;
 42:       }
 43:     }
 44:     ISRestoreIndices(is[i],&idx);
 45:     ISDestroy(&is[i]);

 47:     k = 0;
 48:     for (j=0; j<ov; j++) { /* for each overlap */
 49:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 50:       PetscBTMemzero(mbs,table_in);
 51:       for (l=k; l<isz; l++) PetscBTSet(table_in,nidx[l]);

 53:       n = isz;  /* length of the updated is[i] */
 54:       for (brow=0; brow<mbs; brow++) {
 55:         start = ai[brow]; end   = ai[brow+1];
 56:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 57:           for (l = start; l<end; l++) {
 58:             bcol = aj[l];
 59:             if (!PetscBTLookupSet(table_out,bcol)) {
 60:               nidx[isz++] = bcol;
 61:               if (bcol_max < bcol) bcol_max = bcol;
 62:             }
 63:           }
 64:           k++;
 65:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 66:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 67:           for (l = start; l<end; l++) {
 68:             bcol = aj[l];
 69:             if (bcol > bcol_max) break;
 70:             if (PetscBTLookup(table_in,bcol)) {
 71:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 72:               break; /* for l = start; l<end ; l++) */
 73:             }
 74:           }
 75:         }
 76:       }
 77:     } /* for each overlap */

 79:     /* expand the Index Set */
 80:     for (j=0; j<isz; j++) {
 81:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 82:     }
 83:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 84:   }
 85:   PetscBTDestroy(&table_out);
 86:   PetscFree(nidx);
 87:   PetscFree(nidx2);
 88:   PetscBTDestroy(&table_in);
 89:   return 0;
 90: }

 92: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 93:         Zero some ops' to avoid invalid usse */
 94: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 95: {
 96:   MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
 97:   Bseq->ops->mult                   = NULL;
 98:   Bseq->ops->multadd                = NULL;
 99:   Bseq->ops->multtranspose          = NULL;
100:   Bseq->ops->multtransposeadd       = NULL;
101:   Bseq->ops->lufactor               = NULL;
102:   Bseq->ops->choleskyfactor         = NULL;
103:   Bseq->ops->lufactorsymbolic       = NULL;
104:   Bseq->ops->choleskyfactorsymbolic = NULL;
105:   Bseq->ops->getinertia             = NULL;
106:   return 0;
107: }

109: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
110: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
111: {
112:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
113:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
114:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
115:   const PetscInt *irow,*icol;
116:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
117:   PetscInt       *aj = a->j,*ai = a->i;
118:   MatScalar      *mat_a;
119:   Mat            C;
120:   PetscBool      flag;


123:   ISGetIndices(isrow,&irow);
124:   ISGetIndices(iscol,&icol);
125:   ISGetLocalSize(isrow,&nrows);
126:   ISGetLocalSize(iscol,&ncols);

128:   PetscCalloc1(1+oldcols,&smap);
129:   ssmap = smap;
130:   PetscMalloc1(1+nrows,&lens);
131:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
132:   /* determine lens of each row */
133:   for (i=0; i<nrows; i++) {
134:     kstart  = ai[irow[i]];
135:     kend    = kstart + a->ilen[irow[i]];
136:     lens[i] = 0;
137:     for (k=kstart; k<kend; k++) {
138:       if (ssmap[aj[k]]) lens[i]++;
139:     }
140:   }
141:   /* Create and fill new matrix */
142:   if (scall == MAT_REUSE_MATRIX) {
143:     c = (Mat_SeqSBAIJ*)((*B)->data);

146:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
148:     PetscArrayzero(c->ilen,c->mbs);
149:     C    = *B;
150:   } else {
151:     MatCreate(PetscObjectComm((PetscObject)A),&C);
152:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
153:     MatSetType(C,((PetscObject)A)->type_name);
154:     MatSeqSBAIJSetPreallocation(C,bs,0,lens);
155:   }
156:   c = (Mat_SeqSBAIJ*)(C->data);
157:   for (i=0; i<nrows; i++) {
158:     row      = irow[i];
159:     kstart   = ai[row];
160:     kend     = kstart + a->ilen[row];
161:     mat_i    = c->i[i];
162:     mat_j    = c->j + mat_i;
163:     mat_a    = c->a + mat_i*bs2;
164:     mat_ilen = c->ilen + i;
165:     for (k=kstart; k<kend; k++) {
166:       if ((tcol=ssmap[a->j[k]])) {
167:         *mat_j++ = tcol - 1;
168:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
169:         mat_a   += bs2;
170:         (*mat_ilen)++;
171:       }
172:     }
173:   }
174:   /* sort */
175:   {
176:     MatScalar *work;

178:     PetscMalloc1(bs2,&work);
179:     for (i=0; i<nrows; i++) {
180:       PetscInt ilen;
181:       mat_i = c->i[i];
182:       mat_j = c->j + mat_i;
183:       mat_a = c->a + mat_i*bs2;
184:       ilen  = c->ilen[i];
185:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
186:     }
187:     PetscFree(work);
188:   }

190:   /* Free work space */
191:   ISRestoreIndices(iscol,&icol);
192:   PetscFree(smap);
193:   PetscFree(lens);
194:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
195:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

197:   ISRestoreIndices(isrow,&irow);
198:   *B   = C;
199:   return 0;
200: }

202: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
203: {
204:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
205:   IS             is1,is2;
206:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
207:   const PetscInt *irow,*icol;

209:   ISGetIndices(isrow,&irow);
210:   ISGetIndices(iscol,&icol);
211:   ISGetLocalSize(isrow,&nrows);
212:   ISGetLocalSize(iscol,&ncols);

214:   /* Verify if the indices corespond to each element in a block
215:    and form the IS with compressed IS */
216:   maxmnbs = PetscMax(a->mbs,a->nbs);
217:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
218:   PetscArrayzero(vary,a->mbs);
219:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
220:   for (i=0; i<a->mbs; i++) {
222:   }
223:   count = 0;
224:   for (i=0; i<nrows; i++) {
225:     PetscInt j = irow[i] / bs;
226:     if ((vary[j]--)==bs) iary[count++] = j;
227:   }
228:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

230:   PetscArrayzero(vary,a->nbs);
231:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
232:   for (i=0; i<a->nbs; i++) {
234:   }
235:   count = 0;
236:   for (i=0; i<ncols; i++) {
237:     PetscInt j = icol[i] / bs;
238:     if ((vary[j]--)==bs) iary[count++] = j;
239:   }
240:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
241:   ISRestoreIndices(isrow,&irow);
242:   ISRestoreIndices(iscol,&icol);
243:   PetscFree2(vary,iary);

245:   MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
246:   ISDestroy(&is1);
247:   ISDestroy(&is2);

249:   if (isrow != iscol) {
250:     PetscBool isequal;
251:     ISEqual(isrow,iscol,&isequal);
252:     if (!isequal) {
253:       MatSeqSBAIJZeroOps_Private(*B);
254:     }
255:   }
256:   return 0;
257: }

259: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
261:   PetscInt       i;

263:   if (scall == MAT_INITIAL_MATRIX) {
264:     PetscCalloc1(n+1,B);
265:   }

267:   for (i=0; i<n; i++) {
268:     MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
269:   }
270:   return 0;
271: }

273: /* -------------------------------------------------------*/
274: /* Should check that shapes of vectors and matrices match */
275: /* -------------------------------------------------------*/

277: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
280:   PetscScalar       *z,x1,x2,zero=0.0;
281:   const PetscScalar *x,*xb;
282:   const MatScalar   *v;
283:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
284:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
285:   PetscInt          nonzerorow=0;

287:   VecSet(zz,zero);
288:   if (!a->nz) return 0;
289:   VecGetArrayRead(xx,&x);
290:   VecGetArray(zz,&z);

292:   v  = a->a;
293:   xb = x;

295:   for (i=0; i<mbs; i++) {
296:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
297:     x1          = xb[0]; x2 = xb[1];
298:     ib          = aj + *ai;
299:     jmin        = 0;
300:     nonzerorow += (n>0);
301:     if (*ib == i) {     /* (diag of A)*x */
302:       z[2*i]   += v[0]*x1 + v[2]*x2;
303:       z[2*i+1] += v[2]*x1 + v[3]*x2;
304:       v        += 4; jmin++;
305:     }
306:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
307:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
308:     for (j=jmin; j<n; j++) {
309:       /* (strict lower triangular part of A)*x  */
310:       cval       = ib[j]*2;
311:       z[cval]   += v[0]*x1 + v[1]*x2;
312:       z[cval+1] += v[2]*x1 + v[3]*x2;
313:       /* (strict upper triangular part of A)*x  */
314:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
315:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
316:       v        += 4;
317:     }
318:     xb +=2; ai++;
319:   }

321:   VecRestoreArrayRead(xx,&x);
322:   VecRestoreArray(zz,&z);
323:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
324:   return 0;
325: }

327: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
328: {
329:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
330:   PetscScalar       *z,x1,x2,x3,zero=0.0;
331:   const PetscScalar *x,*xb;
332:   const MatScalar   *v;
333:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
334:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
335:   PetscInt          nonzerorow=0;

337:   VecSet(zz,zero);
338:   if (!a->nz) return 0;
339:   VecGetArrayRead(xx,&x);
340:   VecGetArray(zz,&z);

342:   v  = a->a;
343:   xb = x;

345:   for (i=0; i<mbs; i++) {
346:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
347:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
348:     ib          = aj + *ai;
349:     jmin        = 0;
350:     nonzerorow += (n>0);
351:     if (*ib == i) {     /* (diag of A)*x */
352:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
353:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
354:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
355:       v        += 9; jmin++;
356:     }
357:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
358:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
359:     for (j=jmin; j<n; j++) {
360:       /* (strict lower triangular part of A)*x  */
361:       cval       = ib[j]*3;
362:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
363:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
364:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
365:       /* (strict upper triangular part of A)*x  */
366:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
367:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
368:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
369:       v        += 9;
370:     }
371:     xb +=3; ai++;
372:   }

374:   VecRestoreArrayRead(xx,&x);
375:   VecRestoreArray(zz,&z);
376:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
377:   return 0;
378: }

380: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
381: {
382:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
383:   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
384:   const PetscScalar *x,*xb;
385:   const MatScalar   *v;
386:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
387:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
388:   PetscInt          nonzerorow = 0;

390:   VecSet(zz,zero);
391:   if (!a->nz) return 0;
392:   VecGetArrayRead(xx,&x);
393:   VecGetArray(zz,&z);

395:   v  = a->a;
396:   xb = x;

398:   for (i=0; i<mbs; i++) {
399:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
400:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
401:     ib          = aj + *ai;
402:     jmin        = 0;
403:     nonzerorow += (n>0);
404:     if (*ib == i) {     /* (diag of A)*x */
405:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
406:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
407:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
408:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
409:       v        += 16; jmin++;
410:     }
411:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
412:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
413:     for (j=jmin; j<n; j++) {
414:       /* (strict lower triangular part of A)*x  */
415:       cval       = ib[j]*4;
416:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
417:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
418:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
419:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
420:       /* (strict upper triangular part of A)*x  */
421:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
422:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
423:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
424:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
425:       v        += 16;
426:     }
427:     xb +=4; ai++;
428:   }

430:   VecRestoreArrayRead(xx,&x);
431:   VecRestoreArray(zz,&z);
432:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
433:   return 0;
434: }

436: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
437: {
438:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
439:   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
440:   const PetscScalar *x,*xb;
441:   const MatScalar   *v;
442:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
443:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
444:   PetscInt          nonzerorow=0;

446:   VecSet(zz,zero);
447:   if (!a->nz) return 0;
448:   VecGetArrayRead(xx,&x);
449:   VecGetArray(zz,&z);

451:   v  = a->a;
452:   xb = x;

454:   for (i=0; i<mbs; i++) {
455:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
456:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
457:     ib          = aj + *ai;
458:     jmin        = 0;
459:     nonzerorow += (n>0);
460:     if (*ib == i) {      /* (diag of A)*x */
461:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
462:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
463:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
464:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
465:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
466:       v        += 25; jmin++;
467:     }
468:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
469:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
470:     for (j=jmin; j<n; j++) {
471:       /* (strict lower triangular part of A)*x  */
472:       cval       = ib[j]*5;
473:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
474:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
475:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
476:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
477:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
478:       /* (strict upper triangular part of A)*x  */
479:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
480:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
481:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
482:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
483:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
484:       v        += 25;
485:     }
486:     xb +=5; ai++;
487:   }

489:   VecRestoreArrayRead(xx,&x);
490:   VecRestoreArray(zz,&z);
491:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
492:   return 0;
493: }

495: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
496: {
497:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
498:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
499:   const PetscScalar *x,*xb;
500:   const MatScalar   *v;
501:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
502:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
503:   PetscInt          nonzerorow=0;

505:   VecSet(zz,zero);
506:   if (!a->nz) return 0;
507:   VecGetArrayRead(xx,&x);
508:   VecGetArray(zz,&z);

510:   v  = a->a;
511:   xb = x;

513:   for (i=0; i<mbs; i++) {
514:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
515:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
516:     ib          = aj + *ai;
517:     jmin        = 0;
518:     nonzerorow += (n>0);
519:     if (*ib == i) {      /* (diag of A)*x */
520:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
521:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
522:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
523:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
524:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
525:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
526:       v        += 36; jmin++;
527:     }
528:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
529:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
530:     for (j=jmin; j<n; j++) {
531:       /* (strict lower triangular part of A)*x  */
532:       cval       = ib[j]*6;
533:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
534:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
535:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
536:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
537:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
538:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
539:       /* (strict upper triangular part of A)*x  */
540:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
541:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
542:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
543:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
544:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
545:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
546:       v        += 36;
547:     }
548:     xb +=6; ai++;
549:   }

551:   VecRestoreArrayRead(xx,&x);
552:   VecRestoreArray(zz,&z);
553:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
554:   return 0;
555: }

557: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
558: {
559:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
560:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
561:   const PetscScalar *x,*xb;
562:   const MatScalar   *v;
563:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
564:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
565:   PetscInt          nonzerorow=0;

567:   VecSet(zz,zero);
568:   if (!a->nz) return 0;
569:   VecGetArrayRead(xx,&x);
570:   VecGetArray(zz,&z);

572:   v  = a->a;
573:   xb = x;

575:   for (i=0; i<mbs; i++) {
576:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
577:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
578:     ib          = aj + *ai;
579:     jmin        = 0;
580:     nonzerorow += (n>0);
581:     if (*ib == i) {      /* (diag of A)*x */
582:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
583:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
584:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
585:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
586:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
587:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
588:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
589:       v        += 49; jmin++;
590:     }
591:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
592:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
593:     for (j=jmin; j<n; j++) {
594:       /* (strict lower triangular part of A)*x  */
595:       cval       = ib[j]*7;
596:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
597:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
598:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
599:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
600:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
601:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
602:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
603:       /* (strict upper triangular part of A)*x  */
604:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
605:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
606:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
607:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
608:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
609:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
610:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
611:       v       += 49;
612:     }
613:     xb +=7; ai++;
614:   }
615:   VecRestoreArrayRead(xx,&x);
616:   VecRestoreArray(zz,&z);
617:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
618:   return 0;
619: }

621: /*
622:     This will not work with MatScalar == float because it calls the BLAS
623: */
624: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
625: {
626:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
627:   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
628:   const PetscScalar *x,*x_ptr,*xb;
629:   const MatScalar   *v;
630:   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
631:   const PetscInt    *idx,*aj,*ii;
632:   PetscInt          nonzerorow=0;

634:   VecSet(zz,zero);
635:   if (!a->nz) return 0;
636:   VecGetArrayRead(xx,&x);
637:   VecGetArray(zz,&z);

639:   x_ptr = x;
640:   z_ptr = z;

642:   aj = a->j;
643:   v  = a->a;
644:   ii = a->i;

646:   if (!a->mult_work) {
647:     PetscMalloc1(A->rmap->N+1,&a->mult_work);
648:   }
649:   work = a->mult_work;

651:   for (i=0; i<mbs; i++) {
652:     n           = ii[1] - ii[0]; ncols = n*bs;
653:     workt       = work; idx=aj+ii[0];
654:     nonzerorow += (n>0);

656:     /* upper triangular part */
657:     for (j=0; j<n; j++) {
658:       xb = x_ptr + bs*(*idx++);
659:       for (k=0; k<bs; k++) workt[k] = xb[k];
660:       workt += bs;
661:     }
662:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
663:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

665:     /* strict lower triangular part */
666:     idx = aj+ii[0];
667:     if (n && *idx == i) {
668:       ncols -= bs; v += bs2; idx++; n--;
669:     }

671:     if (ncols > 0) {
672:       workt = work;
673:       PetscArrayzero(workt,ncols);
674:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
675:       for (j=0; j<n; j++) {
676:         zb = z_ptr + bs*(*idx++);
677:         for (k=0; k<bs; k++) zb[k] += workt[k];
678:         workt += bs;
679:       }
680:     }
681:     x += bs; v += n*bs2; z += bs; ii++;
682:   }

684:   VecRestoreArrayRead(xx,&x);
685:   VecRestoreArray(zz,&z);
686:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
687:   return 0;
688: }

690: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
691: {
692:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
693:   PetscScalar       *z,x1;
694:   const PetscScalar *x,*xb;
695:   const MatScalar   *v;
696:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
697:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
698:   PetscInt          nonzerorow=0;
699: #if defined(PETSC_USE_COMPLEX)
700:   const int         aconj = A->hermitian;
701: #else
702:   const int         aconj = 0;
703: #endif

705:   VecCopy(yy,zz);
706:   if (!a->nz) return 0;
707:   VecGetArrayRead(xx,&x);
708:   VecGetArray(zz,&z);
709:   v    = a->a;
710:   xb   = x;

712:   for (i=0; i<mbs; i++) {
713:     n           = ai[1] - ai[0]; /* length of i_th row of A */
714:     x1          = xb[0];
715:     ib          = aj + *ai;
716:     jmin        = 0;
717:     nonzerorow += (n>0);
718:     if (n && *ib == i) {            /* (diag of A)*x */
719:       z[i] += *v++ * x[*ib++]; jmin++;
720:     }
721:     if (aconj) {
722:       for (j=jmin; j<n; j++) {
723:         cval    = *ib;
724:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
725:         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
726:       }
727:     } else {
728:       for (j=jmin; j<n; j++) {
729:         cval    = *ib;
730:         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
731:         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
732:       }
733:     }
734:     xb++; ai++;
735:   }

737:   VecRestoreArrayRead(xx,&x);
738:   VecRestoreArray(zz,&z);

740:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
741:   return 0;
742: }

744: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
745: {
746:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
747:   PetscScalar       *z,x1,x2;
748:   const PetscScalar *x,*xb;
749:   const MatScalar   *v;
750:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
751:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
752:   PetscInt          nonzerorow=0;

754:   VecCopy(yy,zz);
755:   if (!a->nz) return 0;
756:   VecGetArrayRead(xx,&x);
757:   VecGetArray(zz,&z);

759:   v  = a->a;
760:   xb = x;

762:   for (i=0; i<mbs; i++) {
763:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
764:     x1          = xb[0]; x2 = xb[1];
765:     ib          = aj + *ai;
766:     jmin        = 0;
767:     nonzerorow += (n>0);
768:     if (n && *ib == i) {      /* (diag of A)*x */
769:       z[2*i]   += v[0]*x1 + v[2]*x2;
770:       z[2*i+1] += v[2]*x1 + v[3]*x2;
771:       v        += 4; jmin++;
772:     }
773:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
774:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
775:     for (j=jmin; j<n; j++) {
776:       /* (strict lower triangular part of A)*x  */
777:       cval       = ib[j]*2;
778:       z[cval]   += v[0]*x1 + v[1]*x2;
779:       z[cval+1] += v[2]*x1 + v[3]*x2;
780:       /* (strict upper triangular part of A)*x  */
781:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
782:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
783:       v        += 4;
784:     }
785:     xb +=2; ai++;
786:   }
787:   VecRestoreArrayRead(xx,&x);
788:   VecRestoreArray(zz,&z);

790:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
791:   return 0;
792: }

794: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
795: {
796:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
797:   PetscScalar       *z,x1,x2,x3;
798:   const PetscScalar *x,*xb;
799:   const MatScalar   *v;
800:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
801:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
802:   PetscInt          nonzerorow=0;

804:   VecCopy(yy,zz);
805:   if (!a->nz) return 0;
806:   VecGetArrayRead(xx,&x);
807:   VecGetArray(zz,&z);

809:   v  = a->a;
810:   xb = x;

812:   for (i=0; i<mbs; i++) {
813:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
814:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
815:     ib          = aj + *ai;
816:     jmin        = 0;
817:     nonzerorow += (n>0);
818:     if (n && *ib == i) {     /* (diag of A)*x */
819:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
820:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
821:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
822:       v        += 9; jmin++;
823:     }
824:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
825:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
826:     for (j=jmin; j<n; j++) {
827:       /* (strict lower triangular part of A)*x  */
828:       cval       = ib[j]*3;
829:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
830:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
831:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
832:       /* (strict upper triangular part of A)*x  */
833:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
834:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
835:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
836:       v        += 9;
837:     }
838:     xb +=3; ai++;
839:   }

841:   VecRestoreArrayRead(xx,&x);
842:   VecRestoreArray(zz,&z);

844:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
845:   return 0;
846: }

848: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
849: {
850:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
851:   PetscScalar       *z,x1,x2,x3,x4;
852:   const PetscScalar *x,*xb;
853:   const MatScalar   *v;
854:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
855:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
856:   PetscInt          nonzerorow=0;

858:   VecCopy(yy,zz);
859:   if (!a->nz) return 0;
860:   VecGetArrayRead(xx,&x);
861:   VecGetArray(zz,&z);

863:   v  = a->a;
864:   xb = x;

866:   for (i=0; i<mbs; i++) {
867:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
868:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
869:     ib          = aj + *ai;
870:     jmin        = 0;
871:     nonzerorow += (n>0);
872:     if (n && *ib == i) {      /* (diag of A)*x */
873:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
874:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
875:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
876:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
877:       v        += 16; jmin++;
878:     }
879:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
880:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
881:     for (j=jmin; j<n; j++) {
882:       /* (strict lower triangular part of A)*x  */
883:       cval       = ib[j]*4;
884:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
885:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
886:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
887:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
888:       /* (strict upper triangular part of A)*x  */
889:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
890:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
891:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
892:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
893:       v        += 16;
894:     }
895:     xb +=4; ai++;
896:   }

898:   VecRestoreArrayRead(xx,&x);
899:   VecRestoreArray(zz,&z);

901:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
902:   return 0;
903: }

905: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
906: {
907:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
908:   PetscScalar       *z,x1,x2,x3,x4,x5;
909:   const PetscScalar *x,*xb;
910:   const MatScalar   *v;
911:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
912:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
913:   PetscInt          nonzerorow=0;

915:   VecCopy(yy,zz);
916:   if (!a->nz) return 0;
917:   VecGetArrayRead(xx,&x);
918:   VecGetArray(zz,&z);

920:   v  = a->a;
921:   xb = x;

923:   for (i=0; i<mbs; i++) {
924:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
925:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
926:     ib          = aj + *ai;
927:     jmin        = 0;
928:     nonzerorow += (n>0);
929:     if (n && *ib == i) {      /* (diag of A)*x */
930:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
931:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
932:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
933:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
934:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
935:       v        += 25; jmin++;
936:     }
937:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
938:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
939:     for (j=jmin; j<n; j++) {
940:       /* (strict lower triangular part of A)*x  */
941:       cval       = ib[j]*5;
942:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
943:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
944:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
945:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
946:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
947:       /* (strict upper triangular part of A)*x  */
948:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
949:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
950:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
951:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
952:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
953:       v        += 25;
954:     }
955:     xb +=5; ai++;
956:   }

958:   VecRestoreArrayRead(xx,&x);
959:   VecRestoreArray(zz,&z);

961:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
962:   return 0;
963: }

965: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
966: {
967:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
968:   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
969:   const PetscScalar *x,*xb;
970:   const MatScalar   *v;
971:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
972:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
973:   PetscInt          nonzerorow=0;

975:   VecCopy(yy,zz);
976:   if (!a->nz) return 0;
977:   VecGetArrayRead(xx,&x);
978:   VecGetArray(zz,&z);

980:   v  = a->a;
981:   xb = x;

983:   for (i=0; i<mbs; i++) {
984:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
985:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
986:     ib          = aj + *ai;
987:     jmin        = 0;
988:     nonzerorow += (n>0);
989:     if (n && *ib == i) {     /* (diag of A)*x */
990:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
991:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
992:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
993:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
994:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
995:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
996:       v        += 36; jmin++;
997:     }
998:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
999:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1000:     for (j=jmin; j<n; j++) {
1001:       /* (strict lower triangular part of A)*x  */
1002:       cval       = ib[j]*6;
1003:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1004:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1005:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1006:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1007:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1008:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1009:       /* (strict upper triangular part of A)*x  */
1010:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1011:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1012:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1013:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1014:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1015:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1016:       v        += 36;
1017:     }
1018:     xb +=6; ai++;
1019:   }

1021:   VecRestoreArrayRead(xx,&x);
1022:   VecRestoreArray(zz,&z);

1024:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1025:   return 0;
1026: }

1028: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1029: {
1030:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1031:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1032:   const PetscScalar *x,*xb;
1033:   const MatScalar   *v;
1034:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1035:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1036:   PetscInt          nonzerorow=0;

1038:   VecCopy(yy,zz);
1039:   if (!a->nz) return 0;
1040:   VecGetArrayRead(xx,&x);
1041:   VecGetArray(zz,&z);

1043:   v  = a->a;
1044:   xb = x;

1046:   for (i=0; i<mbs; i++) {
1047:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1048:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1049:     ib          = aj + *ai;
1050:     jmin        = 0;
1051:     nonzerorow += (n>0);
1052:     if (n && *ib == i) {     /* (diag of A)*x */
1053:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1054:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1055:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1056:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1057:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1058:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1059:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1060:       v        += 49; jmin++;
1061:     }
1062:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1063:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1064:     for (j=jmin; j<n; j++) {
1065:       /* (strict lower triangular part of A)*x  */
1066:       cval       = ib[j]*7;
1067:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1068:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1069:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1070:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1071:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1072:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1073:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1074:       /* (strict upper triangular part of A)*x  */
1075:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1076:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1077:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1078:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1079:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1080:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1081:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1082:       v       += 49;
1083:     }
1084:     xb +=7; ai++;
1085:   }

1087:   VecRestoreArrayRead(xx,&x);
1088:   VecRestoreArray(zz,&z);

1090:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1091:   return 0;
1092: }

1094: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1095: {
1096:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1097:   PetscScalar       *z,*z_ptr=NULL,*zb,*work,*workt;
1098:   const PetscScalar *x,*x_ptr,*xb;
1099:   const MatScalar   *v;
1100:   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1101:   const PetscInt    *idx,*aj,*ii;
1102:   PetscInt          nonzerorow=0;

1104:   VecCopy(yy,zz);
1105:   if (!a->nz) return 0;
1106:   VecGetArrayRead(xx,&x); x_ptr=x;
1107:   VecGetArray(zz,&z); z_ptr=z;

1109:   aj = a->j;
1110:   v  = a->a;
1111:   ii = a->i;

1113:   if (!a->mult_work) {
1114:     PetscMalloc1(A->rmap->n+1,&a->mult_work);
1115:   }
1116:   work = a->mult_work;

1118:   for (i=0; i<mbs; i++) {
1119:     n           = ii[1] - ii[0]; ncols = n*bs;
1120:     workt       = work; idx=aj+ii[0];
1121:     nonzerorow += (n>0);

1123:     /* upper triangular part */
1124:     for (j=0; j<n; j++) {
1125:       xb = x_ptr + bs*(*idx++);
1126:       for (k=0; k<bs; k++) workt[k] = xb[k];
1127:       workt += bs;
1128:     }
1129:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1130:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1132:     /* strict lower triangular part */
1133:     idx = aj+ii[0];
1134:     if (n && *idx == i) {
1135:       ncols -= bs; v += bs2; idx++; n--;
1136:     }
1137:     if (ncols > 0) {
1138:       workt = work;
1139:       PetscArrayzero(workt,ncols);
1140:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1141:       for (j=0; j<n; j++) {
1142:         zb = z_ptr + bs*(*idx++);
1143:         for (k=0; k<bs; k++) zb[k] += workt[k];
1144:         workt += bs;
1145:       }
1146:     }

1148:     x += bs; v += n*bs2; z += bs; ii++;
1149:   }

1151:   VecRestoreArrayRead(xx,&x);
1152:   VecRestoreArray(zz,&z);

1154:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1155:   return 0;
1156: }

1158: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1159: {
1160:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1161:   PetscScalar    oalpha = alpha;
1162:   PetscBLASInt   one = 1,totalnz;

1164:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1165:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1166:   PetscLogFlops(totalnz);
1167:   return 0;
1168: }

1170: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1171: {
1172:   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1173:   const MatScalar *v       = a->a;
1174:   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1175:   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1176:   const PetscInt  *aj=a->j,*col;

1178:   if (!a->nz) {
1179:     *norm = 0.0;
1180:     return 0;
1181:   }
1182:   if (type == NORM_FROBENIUS) {
1183:     for (k=0; k<mbs; k++) {
1184:       jmin = a->i[k];
1185:       jmax = a->i[k+1];
1186:       col  = aj + jmin;
1187:       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
1188:         for (i=0; i<bs2; i++) {
1189:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1190:         }
1191:         jmin++;
1192:       }
1193:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1194:         for (i=0; i<bs2; i++) {
1195:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1196:         }
1197:       }
1198:     }
1199:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1200:     PetscLogFlops(2.0*bs2*a->nz);
1201:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1202:     PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1203:     for (i=0; i<mbs; i++) jl[i] = mbs;
1204:     il[0] = 0;

1206:     *norm = 0.0;
1207:     for (k=0; k<mbs; k++) { /* k_th block row */
1208:       for (j=0; j<bs; j++) sum[j]=0.0;
1209:       /*-- col sum --*/
1210:       i = jl[k]; /* first |A(i,k)| to be added */
1211:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1212:                   at step k */
1213:       while (i<mbs) {
1214:         nexti = jl[i];  /* next block row to be added */
1215:         ik    = il[i];  /* block index of A(i,k) in the array a */
1216:         for (j=0; j<bs; j++) {
1217:           v = a->a + ik*bs2 + j*bs;
1218:           for (k1=0; k1<bs; k1++) {
1219:             sum[j] += PetscAbsScalar(*v); v++;
1220:           }
1221:         }
1222:         /* update il, jl */
1223:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1224:         jmax = a->i[i+1];
1225:         if (jmin < jmax) {
1226:           il[i] = jmin;
1227:           j     = a->j[jmin];
1228:           jl[i] = jl[j]; jl[j]=i;
1229:         }
1230:         i = nexti;
1231:       }
1232:       /*-- row sum --*/
1233:       jmin = a->i[k];
1234:       jmax = a->i[k+1];
1235:       for (i=jmin; i<jmax; i++) {
1236:         for (j=0; j<bs; j++) {
1237:           v = a->a + i*bs2 + j;
1238:           for (k1=0; k1<bs; k1++) {
1239:             sum[j] += PetscAbsScalar(*v); v += bs;
1240:           }
1241:         }
1242:       }
1243:       /* add k_th block row to il, jl */
1244:       col = aj+jmin;
1245:       if (jmax - jmin > 0 && *col == k) jmin++;
1246:       if (jmin < jmax) {
1247:         il[k] = jmin;
1248:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1249:       }
1250:       for (j=0; j<bs; j++) {
1251:         if (sum[j] > *norm) *norm = sum[j];
1252:       }
1253:     }
1254:     PetscFree3(sum,il,jl);
1255:     PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1256:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1257:   return 0;
1258: }

1260: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1261: {
1262:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1264:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1265:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1266:     *flg = PETSC_FALSE;
1267:     return 0;
1268:   }

1270:   /* if the a->i are the same */
1271:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1272:   if (!*flg) return 0;

1274:   /* if a->j are the same */
1275:   PetscArraycmp(a->j,b->j,a->nz,flg);
1276:   if (!*flg) return 0;

1278:   /* if a->a are the same */
1279:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1280:   return 0;
1281: }

1283: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1284: {
1285:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1286:   PetscInt        i,j,k,row,bs,ambs,bs2;
1287:   const PetscInt  *ai,*aj;
1288:   PetscScalar     *x,zero = 0.0;
1289:   const MatScalar *aa,*aa_j;

1291:   bs = A->rmap->bs;

1294:   aa   = a->a;
1295:   ambs = a->mbs;

1297:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1298:     PetscInt *diag=a->diag;
1299:     aa   = a->a;
1300:     ambs = a->mbs;
1301:     VecGetArray(v,&x);
1302:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1303:     VecRestoreArray(v,&x);
1304:     return 0;
1305:   }

1307:   ai   = a->i;
1308:   aj   = a->j;
1309:   bs2  = a->bs2;
1310:   VecSet(v,zero);
1311:   if (!a->nz) return 0;
1312:   VecGetArray(v,&x);
1313:   for (i=0; i<ambs; i++) {
1314:     j = ai[i];
1315:     if (aj[j] == i) {    /* if this is a diagonal element */
1316:       row  = i*bs;
1317:       aa_j = aa + j*bs2;
1318:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1319:     }
1320:   }
1321:   VecRestoreArray(v,&x);
1322:   return 0;
1323: }

1325: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1326: {
1327:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1328:   PetscScalar       x;
1329:   const PetscScalar *l,*li,*ri;
1330:   MatScalar         *aa,*v;
1331:   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1332:   const PetscInt    *ai,*aj;
1333:   PetscBool         flg;

1335:   if (ll != rr) {
1336:     VecEqual(ll,rr,&flg);
1338:   }
1339:   if (!ll) return 0;
1340:   ai  = a->i;
1341:   aj  = a->j;
1342:   aa  = a->a;
1343:   m   = A->rmap->N;
1344:   bs  = A->rmap->bs;
1345:   mbs = a->mbs;
1346:   bs2 = a->bs2;

1348:   VecGetArrayRead(ll,&l);
1349:   VecGetLocalSize(ll,&lm);
1351:   for (i=0; i<mbs; i++) { /* for each block row */
1352:     M  = ai[i+1] - ai[i];
1353:     li = l + i*bs;
1354:     v  = aa + bs2*ai[i];
1355:     for (j=0; j<M; j++) { /* for each block */
1356:       ri = l + bs*aj[ai[i]+j];
1357:       for (k=0; k<bs; k++) {
1358:         x = ri[k];
1359:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1360:       }
1361:     }
1362:   }
1363:   VecRestoreArrayRead(ll,&l);
1364:   PetscLogFlops(2.0*a->nz);
1365:   return 0;
1366: }

1368: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1369: {
1370:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1372:   info->block_size   = a->bs2;
1373:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1374:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1375:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1376:   info->assemblies   = A->num_ass;
1377:   info->mallocs      = A->info.mallocs;
1378:   info->memory       = ((PetscObject)A)->mem;
1379:   if (A->factortype) {
1380:     info->fill_ratio_given  = A->info.fill_ratio_given;
1381:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1382:     info->factor_mallocs    = A->info.factor_mallocs;
1383:   } else {
1384:     info->fill_ratio_given  = 0;
1385:     info->fill_ratio_needed = 0;
1386:     info->factor_mallocs    = 0;
1387:   }
1388:   return 0;
1389: }

1391: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1392: {
1393:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1395:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1396:   return 0;
1397: }

1399: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1400: {
1401:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1402:   PetscInt        i,j,n,row,col,bs,mbs;
1403:   const PetscInt  *ai,*aj;
1404:   PetscReal       atmp;
1405:   const MatScalar *aa;
1406:   PetscScalar     *x;
1407:   PetscInt        ncols,brow,bcol,krow,kcol;

1411:   bs  = A->rmap->bs;
1412:   aa  = a->a;
1413:   ai  = a->i;
1414:   aj  = a->j;
1415:   mbs = a->mbs;

1417:   VecSet(v,0.0);
1418:   VecGetArray(v,&x);
1419:   VecGetLocalSize(v,&n);
1421:   for (i=0; i<mbs; i++) {
1422:     ncols = ai[1] - ai[0]; ai++;
1423:     brow  = bs*i;
1424:     for (j=0; j<ncols; j++) {
1425:       bcol = bs*(*aj);
1426:       for (kcol=0; kcol<bs; kcol++) {
1427:         col = bcol + kcol;      /* col index */
1428:         for (krow=0; krow<bs; krow++) {
1429:           atmp = PetscAbsScalar(*aa); aa++;
1430:           row  = brow + krow;   /* row index */
1431:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1432:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1433:         }
1434:       }
1435:       aj++;
1436:     }
1437:   }
1438:   VecRestoreArray(v,&x);
1439:   return 0;
1440: }

1442: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1443: {
1444:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1445:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1446:   return 0;
1447: }

1449: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1450: {
1451:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1452:   PetscScalar       *z = c;
1453:   const PetscScalar *xb;
1454:   PetscScalar       x1;
1455:   const MatScalar   *v = a->a,*vv;
1456:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1457: #if defined(PETSC_USE_COMPLEX)
1458:   const int         aconj = A->hermitian;
1459: #else
1460:   const int         aconj = 0;
1461: #endif

1463:   for (i=0; i<mbs; i++) {
1464:     n = ii[1] - ii[0]; ii++;
1465:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1466:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1467:     jj = idx;
1468:     vv = v;
1469:     for (k=0; k<cn; k++) {
1470:       idx = jj;
1471:       v = vv;
1472:       for (j=0; j<n; j++) {
1473:         xb = b + (*idx); x1 = xb[0+k*bm];
1474:         z[0+k*cm] += v[0]*x1;
1475:         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1476:         v += 1;
1477:         ++idx;
1478:       }
1479:     }
1480:     z += 1;
1481:   }
1482:   return 0;
1483: }

1485: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1486: {
1487:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1488:   PetscScalar       *z = c;
1489:   const PetscScalar *xb;
1490:   PetscScalar       x1,x2;
1491:   const MatScalar   *v = a->a,*vv;
1492:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1494:   for (i=0; i<mbs; i++) {
1495:     n = ii[1] - ii[0]; ii++;
1496:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1497:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1498:     jj = idx;
1499:     vv = v;
1500:     for (k=0; k<cn; k++) {
1501:       idx = jj;
1502:       v = vv;
1503:       for (j=0; j<n; j++) {
1504:         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1505:         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1506:         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1507:         if (*idx != i) {
1508:           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1509:           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1510:         }
1511:         v += 4;
1512:         ++idx;
1513:       }
1514:     }
1515:     z += 2;
1516:   }
1517:   return 0;
1518: }

1520: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1521: {
1522:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1523:   PetscScalar       *z = c;
1524:   const PetscScalar *xb;
1525:   PetscScalar       x1,x2,x3;
1526:   const MatScalar   *v = a->a,*vv;
1527:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1529:   for (i=0; i<mbs; i++) {
1530:     n = ii[1] - ii[0]; ii++;
1531:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1532:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1533:     jj = idx;
1534:     vv = v;
1535:     for (k=0; k<cn; k++) {
1536:       idx = jj;
1537:       v = vv;
1538:       for (j=0; j<n; j++) {
1539:         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1540:         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1541:         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1542:         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1543:         if (*idx != i) {
1544:           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1545:           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1546:           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1547:         }
1548:         v += 9;
1549:         ++idx;
1550:       }
1551:     }
1552:     z += 3;
1553:   }
1554:   return 0;
1555: }

1557: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1558: {
1559:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1560:   PetscScalar       *z = c;
1561:   const PetscScalar *xb;
1562:   PetscScalar       x1,x2,x3,x4;
1563:   const MatScalar   *v = a->a,*vv;
1564:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1566:   for (i=0; i<mbs; i++) {
1567:     n = ii[1] - ii[0]; ii++;
1568:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1569:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1570:     jj = idx;
1571:     vv = v;
1572:     for (k=0; k<cn; k++) {
1573:       idx = jj;
1574:       v = vv;
1575:       for (j=0; j<n; j++) {
1576:         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1577:         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1578:         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1579:         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1580:         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1581:         if (*idx != i) {
1582:           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1583:           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1584:           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1585:           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1586:         }
1587:         v += 16;
1588:         ++idx;
1589:       }
1590:     }
1591:     z += 4;
1592:   }
1593:   return 0;
1594: }

1596: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1597: {
1598:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1599:   PetscScalar       *z = c;
1600:   const PetscScalar *xb;
1601:   PetscScalar       x1,x2,x3,x4,x5;
1602:   const MatScalar   *v = a->a,*vv;
1603:   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;

1605:   for (i=0; i<mbs; i++) {
1606:     n = ii[1] - ii[0]; ii++;
1607:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1608:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1609:     jj = idx;
1610:     vv = v;
1611:     for (k=0; k<cn; k++) {
1612:       idx = jj;
1613:       v = vv;
1614:       for (j=0; j<n; j++) {
1615:         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1616:         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1617:         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1618:         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1619:         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1620:         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1621:         if (*idx != i) {
1622:           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1623:           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1624:           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1625:           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1626:           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1627:         }
1628:         v += 25;
1629:         ++idx;
1630:       }
1631:     }
1632:     z += 5;
1633:   }
1634:   return 0;
1635: }

1637: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1638: {
1639:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1640:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1641:   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1642:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1643:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1644:   PetscBLASInt      bbs,bcn,bbm,bcm;
1645:   PetscScalar       *z = NULL;
1646:   PetscScalar       *c,*b;
1647:   const MatScalar   *v;
1648:   const PetscInt    *idx,*ii;
1649:   PetscScalar       _DOne=1.0;

1651:   if (!cm || !cn) return 0;
1655:   b = bd->v;
1656:   MatZeroEntries(C);
1657:   MatDenseGetArray(C,&c);
1658:   switch (bs) {
1659:   case 1:
1660:     MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1661:     break;
1662:   case 2:
1663:     MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1664:     break;
1665:   case 3:
1666:     MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1667:     break;
1668:   case 4:
1669:     MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1670:     break;
1671:   case 5:
1672:     MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1673:     break;
1674:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1675:     PetscBLASIntCast(bs,&bbs);
1676:     PetscBLASIntCast(cn,&bcn);
1677:     PetscBLASIntCast(bm,&bbm);
1678:     PetscBLASIntCast(cm,&bcm);
1679:     idx = a->j;
1680:     v   = a->a;
1681:     mbs = a->mbs;
1682:     ii  = a->i;
1683:     z   = c;
1684:     for (i=0; i<mbs; i++) {
1685:       n = ii[1] - ii[0]; ii++;
1686:       for (j=0; j<n; j++) {
1687:         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1688:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1689:         v += bs2;
1690:       }
1691:       z += bs;
1692:     }
1693:   }
1694:   MatDenseRestoreArray(C,&c);
1695:   PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);
1696:   return 0;
1697: }