Actual source code: sbaij2.c

petsc-3.6.4 2016-04-12
Report Typos and Errors
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6: #include <petscblaslapack.h>

 10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 14:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 15:   const PetscInt *idx;
 16:   PetscBT        table_out,table_in;

 19:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 20:   mbs  = a->mbs;
 21:   ai   = a->i;
 22:   aj   = a->j;
 23:   bs   = A->rmap->bs;
 24:   PetscBTCreate(mbs,&table_out);
 25:   PetscMalloc1(mbs+1,&nidx);
 26:   PetscMalloc1(A->rmap->N+1,&nidx2);
 27:   PetscBTCreate(mbs,&table_in);

 29:   for (i=0; i<is_max; i++) { /* for each is */
 30:     isz  = 0;
 31:     PetscBTMemzero(mbs,table_out);

 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 38:     bcol_max = 0;
 39:     for (j=0; j<n; ++j) {
 40:       brow = idx[j]/bs; /* convert the indices into block indices */
 41:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if (!PetscBTLookupSet(table_out,brow)) {
 43:         nidx[isz++] = brow;
 44:         if (bcol_max < brow) bcol_max = brow;
 45:       }
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);

 50:     k = 0;
 51:     for (j=0; j<ov; j++) { /* for each overlap */
 52:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 53:       PetscBTMemzero(mbs,table_in);
 54:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 56:       n = isz;  /* length of the updated is[i] */
 57:       for (brow=0; brow<mbs; brow++) {
 58:         start = ai[brow]; end   = ai[brow+1];
 59:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 60:           for (l = start; l<end; l++) {
 61:             bcol = aj[l];
 62:             if (!PetscBTLookupSet(table_out,bcol)) {
 63:               nidx[isz++] = bcol;
 64:               if (bcol_max < bcol) bcol_max = bcol;
 65:             }
 66:           }
 67:           k++;
 68:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 69:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 70:           for (l = start; l<end; l++) {
 71:             bcol = aj[l];
 72:             if (bcol > bcol_max) break;
 73:             if (PetscBTLookup(table_in,bcol)) {
 74:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 75:               break; /* for l = start; l<end ; l++) */
 76:             }
 77:           }
 78:         }
 79:       }
 80:     } /* for each overlap */

 82:     /* expand the Index Set */
 83:     for (j=0; j<isz; j++) {
 84:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 85:     }
 86:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 87:   }
 88:   PetscBTDestroy(&table_out);
 89:   PetscFree(nidx);
 90:   PetscFree(nidx2);
 91:   PetscBTDestroy(&table_in);
 92:   return(0);
 93: }

 97: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
 98: {
 99:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
100:   PetscErrorCode  ierr;
101:   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102:   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
103:   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104:   const PetscInt  *irow,*aj = a->j,*ai = a->i;
105:   MatScalar       *mat_a;
106:   Mat             C;
107:   PetscBool       flag,sorted;

110:   ISSorted(isrow,&sorted);
111:   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

113:   ISGetIndices(isrow,&irow);
114:   ISGetSize(isrow,&nrows);

116:   PetscMalloc1(oldcols,&smap);
117:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
118:   ssmap = smap;
119:   PetscMalloc1(1+nrows,&lens);
120:   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121:   /* determine lens of each row */
122:   for (i=0; i<nrows; i++) {
123:     kstart  = ai[irow[i]];
124:     kend    = kstart + a->ilen[irow[i]];
125:     lens[i] = 0;
126:     for (k=kstart; k<kend; k++) {
127:       if (ssmap[aj[k]]) lens[i]++;
128:     }
129:   }
130:   /* Create and fill new matrix */
131:   if (scall == MAT_REUSE_MATRIX) {
132:     c = (Mat_SeqSBAIJ*)((*B)->data);

134:     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
135:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
136:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
137:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
138:     C    = *B;
139:   } else {
140:     MatCreate(PetscObjectComm((PetscObject)A),&C);
141:     MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);
142:     MatSetType(C,((PetscObject)A)->type_name);
143:     MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
144:   }
145:   c = (Mat_SeqSBAIJ*)(C->data);
146:   for (i=0; i<nrows; i++) {
147:     row      = irow[i];
148:     kstart   = ai[row];
149:     kend     = kstart + a->ilen[row];
150:     mat_i    = c->i[i];
151:     mat_j    = c->j + mat_i;
152:     mat_a    = c->a + mat_i*bs2;
153:     mat_ilen = c->ilen + i;
154:     for (k=kstart; k<kend; k++) {
155:       if ((tcol=ssmap[a->j[k]])) {
156:         *mat_j++ = tcol - 1;
157:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
158:         mat_a   += bs2;
159:         (*mat_ilen)++;
160:       }
161:     }
162:   }

164:   /* Free work space */
165:   PetscFree(smap);
166:   PetscFree(lens);
167:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
168:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

170:   ISRestoreIndices(isrow,&irow);
171:   *B   = C;
172:   return(0);
173: }

177: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
178: {
179:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
180:   IS             is1;
182:   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
183:   const PetscInt *irow;

186:   if (isrow != iscol) {
187:     PetscBool isequal;
188:     ISEqual(isrow,iscol,&isequal);
189:     if (!isequal)
190:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
191:   }

193:   ISGetIndices(isrow,&irow);
194:   ISGetSize(isrow,&nrows);

196:   /* Verify if the indices corespond to each element in a block
197:    and form the IS with compressed IS */
198:   PetscMalloc2(a->mbs,&vary,a->mbs,&iary);
199:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
200:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;

202:   count = 0;
203:   for (i=0; i<a->mbs; i++) {
204:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
205:     if (vary[i]==bs) iary[count++] = i;
206:   }
207:   ISRestoreIndices(isrow,&irow);
208:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
209:   PetscFree2(vary,iary);

211:   MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);
212:   ISDestroy(&is1);
213:   return(0);
214: }

218: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219: {
221:   PetscInt       i;

224:   if (scall == MAT_INITIAL_MATRIX) {
225:     PetscMalloc1(n+1,B);
226:   }

228:   for (i=0; i<n; i++) {
229:     MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
230:   }
231:   return(0);
232: }

234: /* -------------------------------------------------------*/
235: /* Should check that shapes of vectors and matrices match */
236: /* -------------------------------------------------------*/

240: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
241: {
242:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
243:   PetscScalar       *z,x1,x2,zero=0.0;
244:   const PetscScalar *x,*xb;
245:   const MatScalar   *v;
246:   PetscErrorCode    ierr;
247:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
248:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
249:   PetscInt          nonzerorow=0;

252:   VecSet(zz,zero);
253:   VecGetArrayRead(xx,&x);
254:   VecGetArray(zz,&z);

256:   v  = a->a;
257:   xb = x;

259:   for (i=0; i<mbs; i++) {
260:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
261:     x1          = xb[0]; x2 = xb[1];
262:     ib          = aj + *ai;
263:     jmin        = 0;
264:     nonzerorow += (n>0);
265:     if (*ib == i) {     /* (diag of A)*x */
266:       z[2*i]   += v[0]*x1 + v[2]*x2;
267:       z[2*i+1] += v[2]*x1 + v[3]*x2;
268:       v        += 4; jmin++;
269:     }
270:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
271:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
272:     for (j=jmin; j<n; j++) {
273:       /* (strict lower triangular part of A)*x  */
274:       cval       = ib[j]*2;
275:       z[cval]   += v[0]*x1 + v[1]*x2;
276:       z[cval+1] += v[2]*x1 + v[3]*x2;
277:       /* (strict upper triangular part of A)*x  */
278:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
279:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
280:       v        += 4;
281:     }
282:     xb +=2; ai++;
283:   }

285:   VecRestoreArrayRead(xx,&x);
286:   VecRestoreArray(zz,&z);
287:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
288:   return(0);
289: }

293: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
294: {
295:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
296:   PetscScalar       *z,x1,x2,x3,zero=0.0;
297:   const PetscScalar *x,*xb;
298:   const MatScalar   *v;
299:   PetscErrorCode    ierr;
300:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
301:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
302:   PetscInt          nonzerorow=0;

305:   VecSet(zz,zero);
306:   VecGetArrayRead(xx,&x);
307:   VecGetArray(zz,&z);

309:   v  = a->a;
310:   xb = x;

312:   for (i=0; i<mbs; i++) {
313:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
314:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
315:     ib          = aj + *ai;
316:     jmin        = 0;
317:     nonzerorow += (n>0);
318:     if (*ib == i) {     /* (diag of A)*x */
319:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
320:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
321:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
322:       v        += 9; jmin++;
323:     }
324:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
325:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
326:     for (j=jmin; j<n; j++) {
327:       /* (strict lower triangular part of A)*x  */
328:       cval       = ib[j]*3;
329:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
330:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
331:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
332:       /* (strict upper triangular part of A)*x  */
333:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
334:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
335:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
336:       v        += 9;
337:     }
338:     xb +=3; ai++;
339:   }

341:   VecRestoreArrayRead(xx,&x);
342:   VecRestoreArray(zz,&z);
343:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
344:   return(0);
345: }

349: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
350: {
351:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
352:   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
353:   const PetscScalar *x,*xb;
354:   const MatScalar   *v;
355:   PetscErrorCode    ierr;
356:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
357:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
358:   PetscInt          nonzerorow = 0;

361:   VecSet(zz,zero);
362:   VecGetArrayRead(xx,&x);
363:   VecGetArray(zz,&z);

365:   v  = a->a;
366:   xb = x;

368:   for (i=0; i<mbs; i++) {
369:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
370:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
371:     ib          = aj + *ai;
372:     jmin        = 0;
373:     nonzerorow += (n>0);
374:     if (*ib == i) {     /* (diag of A)*x */
375:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
376:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
377:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
378:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
379:       v        += 16; jmin++;
380:     }
381:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
382:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383:     for (j=jmin; j<n; j++) {
384:       /* (strict lower triangular part of A)*x  */
385:       cval       = ib[j]*4;
386:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
387:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
388:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
389:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
390:       /* (strict upper triangular part of A)*x  */
391:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
392:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
393:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
394:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
395:       v        += 16;
396:     }
397:     xb +=4; ai++;
398:   }

400:   VecRestoreArrayRead(xx,&x);
401:   VecRestoreArray(zz,&z);
402:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
403:   return(0);
404: }

408: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
409: {
410:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411:   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
412:   const PetscScalar *x,*xb;
413:   const MatScalar   *v;
414:   PetscErrorCode    ierr;
415:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416:   const PetscInt    *aj = a->j,*ai = a->i,*ib;
417:   PetscInt          nonzerorow=0;

420:   VecSet(zz,zero);
421:   VecGetArrayRead(xx,&x);
422:   VecGetArray(zz,&z);

424:   v  = a->a;
425:   xb = x;

427:   for (i=0; i<mbs; i++) {
428:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
429:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
430:     ib          = aj + *ai;
431:     jmin        = 0;
432:     nonzerorow += (n>0);
433:     if (*ib == i) {      /* (diag of A)*x */
434:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
435:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
436:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
437:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
438:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
439:       v        += 25; jmin++;
440:     }
441:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
442:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
443:     for (j=jmin; j<n; j++) {
444:       /* (strict lower triangular part of A)*x  */
445:       cval       = ib[j]*5;
446:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
447:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
448:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
449:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
450:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
451:       /* (strict upper triangular part of A)*x  */
452:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
453:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
454:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
455:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
456:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
457:       v        += 25;
458:     }
459:     xb +=5; ai++;
460:   }

462:   VecRestoreArrayRead(xx,&x);
463:   VecRestoreArray(zz,&z);
464:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
465:   return(0);
466: }


471: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
472: {
473:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
474:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
475:   const PetscScalar *x,*xb;
476:   const MatScalar   *v;
477:   PetscErrorCode    ierr;
478:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
479:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
480:   PetscInt          nonzerorow=0;

483:   VecSet(zz,zero);
484:   VecGetArrayRead(xx,&x);
485:   VecGetArray(zz,&z);

487:   v  = a->a;
488:   xb = x;

490:   for (i=0; i<mbs; i++) {
491:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
492:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
493:     ib          = aj + *ai;
494:     jmin        = 0;
495:     nonzerorow += (n>0);
496:     if (*ib == i) {      /* (diag of A)*x */
497:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
498:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
499:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
500:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
501:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
502:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
503:       v        += 36; jmin++;
504:     }
505:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
506:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
507:     for (j=jmin; j<n; j++) {
508:       /* (strict lower triangular part of A)*x  */
509:       cval       = ib[j]*6;
510:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
511:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
512:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
513:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
514:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
515:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
516:       /* (strict upper triangular part of A)*x  */
517:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
518:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
519:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
520:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
521:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
522:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
523:       v        += 36;
524:     }
525:     xb +=6; ai++;
526:   }

528:   VecRestoreArrayRead(xx,&x);
529:   VecRestoreArray(zz,&z);
530:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
531:   return(0);
532: }
535: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
536: {
537:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
538:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
539:   const PetscScalar *x,*xb;
540:   const MatScalar   *v;
541:   PetscErrorCode    ierr;
542:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
543:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
544:   PetscInt          nonzerorow=0;

547:   VecSet(zz,zero);
548:   VecGetArrayRead(xx,&x);
549:   VecGetArray(zz,&z);

551:   v  = a->a;
552:   xb = x;

554:   for (i=0; i<mbs; i++) {
555:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
556:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
557:     ib          = aj + *ai;
558:     jmin        = 0;
559:     nonzerorow += (n>0);
560:     if (*ib == i) {      /* (diag of A)*x */
561:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
562:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
563:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
564:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
565:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
566:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
567:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
568:       v        += 49; jmin++;
569:     }
570:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
571:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
572:     for (j=jmin; j<n; j++) {
573:       /* (strict lower triangular part of A)*x  */
574:       cval       = ib[j]*7;
575:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
576:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
577:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
578:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
579:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
580:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
581:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
582:       /* (strict upper triangular part of A)*x  */
583:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
584:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
585:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
586:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
587:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
588:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
589:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
590:       v       += 49;
591:     }
592:     xb +=7; ai++;
593:   }
594:   VecRestoreArrayRead(xx,&x);
595:   VecRestoreArray(zz,&z);
596:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
597:   return(0);
598: }

600: /*
601:     This will not work with MatScalar == float because it calls the BLAS
602: */
605: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
606: {
607:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
608:   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
609:   const PetscScalar *x,*x_ptr,*xb;
610:   const MatScalar   *v;
611:   PetscErrorCode    ierr;
612:   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
613:   const PetscInt    *idx,*aj,*ii;
614:   PetscInt          nonzerorow=0;

617:   VecSet(zz,zero);
618:   VecGetArrayRead(xx,&x);x_ptr = x;
619:   VecGetArray(zz,&z); z_ptr=z;

621:   aj = a->j;
622:   v  = a->a;
623:   ii = a->i;

625:   if (!a->mult_work) {
626:     PetscMalloc1(A->rmap->N+1,&a->mult_work);
627:   }
628:   work = a->mult_work;

630:   for (i=0; i<mbs; i++) {
631:     n           = ii[1] - ii[0]; ncols = n*bs;
632:     workt       = work; idx=aj+ii[0];
633:     nonzerorow += (n>0);

635:     /* upper triangular part */
636:     for (j=0; j<n; j++) {
637:       xb = x_ptr + bs*(*idx++);
638:       for (k=0; k<bs; k++) workt[k] = xb[k];
639:       workt += bs;
640:     }
641:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
642:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

644:     /* strict lower triangular part */
645:     idx = aj+ii[0];
646:     if (*idx == i) {
647:       ncols -= bs; v += bs2; idx++; n--;
648:     }

650:     if (ncols > 0) {
651:       workt = work;
652:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
653:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
654:       for (j=0; j<n; j++) {
655:         zb = z_ptr + bs*(*idx++);
656:         for (k=0; k<bs; k++) zb[k] += workt[k];
657:         workt += bs;
658:       }
659:     }
660:     x += bs; v += n*bs2; z += bs; ii++;
661:   }

663:   VecRestoreArrayRead(xx,&x);
664:   VecRestoreArray(zz,&z);
665:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
666:   return(0);
667: }

671: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
672: {
673:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
674:   PetscScalar       *z,x1;
675:   const PetscScalar *x,*xb;
676:   const MatScalar   *v;
677:   PetscErrorCode    ierr;
678:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
679:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
680:   PetscInt          nonzerorow=0;

683:   VecCopy(yy,zz);
684:   VecGetArrayRead(xx,&x);
685:   VecGetArray(zz,&z);
686:   v    = a->a;
687:   xb   = x;

689:   for (i=0; i<mbs; i++) {
690:     n           = ai[1] - ai[0]; /* length of i_th row of A */
691:     x1          = xb[0];
692:     ib          = aj + *ai;
693:     jmin        = 0;
694:     nonzerorow += (n>0);
695:     if (*ib == i) {            /* (diag of A)*x */
696:       z[i] += *v++ * x[*ib++]; jmin++;
697:     }
698:     for (j=jmin; j<n; j++) {
699:       cval    = *ib;
700:       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
701:       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
702:     }
703:     xb++; ai++;
704:   }

706:   VecRestoreArrayRead(xx,&x);
707:   VecRestoreArray(zz,&z);

709:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
710:   return(0);
711: }

715: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
716: {
717:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718:   PetscScalar       *z,x1,x2;
719:   const PetscScalar *x,*xb;
720:   const MatScalar   *v;
721:   PetscErrorCode    ierr;
722:   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
724:   PetscInt          nonzerorow=0;

727:   VecCopy(yy,zz);
728:   VecGetArrayRead(xx,&x);
729:   VecGetArray(zz,&z);

731:   v  = a->a;
732:   xb = x;

734:   for (i=0; i<mbs; i++) {
735:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
736:     x1          = xb[0]; x2 = xb[1];
737:     ib          = aj + *ai;
738:     jmin        = 0;
739:     nonzerorow += (n>0);
740:     if (*ib == i) {      /* (diag of A)*x */
741:       z[2*i]   += v[0]*x1 + v[2]*x2;
742:       z[2*i+1] += v[2]*x1 + v[3]*x2;
743:       v        += 4; jmin++;
744:     }
745:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
746:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
747:     for (j=jmin; j<n; j++) {
748:       /* (strict lower triangular part of A)*x  */
749:       cval       = ib[j]*2;
750:       z[cval]   += v[0]*x1 + v[1]*x2;
751:       z[cval+1] += v[2]*x1 + v[3]*x2;
752:       /* (strict upper triangular part of A)*x  */
753:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
754:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
755:       v        += 4;
756:     }
757:     xb +=2; ai++;
758:   }
759:   VecRestoreArrayRead(xx,&x);
760:   VecRestoreArray(zz,&z);

762:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
763:   return(0);
764: }

768: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
769: {
770:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
771:   PetscScalar       *z,x1,x2,x3;
772:   const PetscScalar *x,*xb;
773:   const MatScalar   *v;
774:   PetscErrorCode    ierr;
775:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
776:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
777:   PetscInt          nonzerorow=0;

780:   VecCopy(yy,zz);
781:   VecGetArrayRead(xx,&x);
782:   VecGetArray(zz,&z);

784:   v  = a->a;
785:   xb = x;

787:   for (i=0; i<mbs; i++) {
788:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
789:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
790:     ib          = aj + *ai;
791:     jmin        = 0;
792:     nonzerorow += (n>0);
793:     if (*ib == i) {     /* (diag of A)*x */
794:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
795:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
796:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
797:       v        += 9; jmin++;
798:     }
799:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
801:     for (j=jmin; j<n; j++) {
802:       /* (strict lower triangular part of A)*x  */
803:       cval       = ib[j]*3;
804:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
805:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
806:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
807:       /* (strict upper triangular part of A)*x  */
808:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
809:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
810:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
811:       v        += 9;
812:     }
813:     xb +=3; ai++;
814:   }

816:   VecRestoreArrayRead(xx,&x);
817:   VecRestoreArray(zz,&z);

819:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
820:   return(0);
821: }

825: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
826: {
827:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
828:   PetscScalar       *z,x1,x2,x3,x4;
829:   const PetscScalar *x,*xb;
830:   const MatScalar   *v;
831:   PetscErrorCode    ierr;
832:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
833:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
834:   PetscInt          nonzerorow=0;

837:   VecCopy(yy,zz);
838:   VecGetArrayRead(xx,&x);
839:   VecGetArray(zz,&z);

841:   v  = a->a;
842:   xb = x;

844:   for (i=0; i<mbs; i++) {
845:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
846:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
847:     ib          = aj + *ai;
848:     jmin        = 0;
849:     nonzerorow += (n>0);
850:     if (*ib == i) {      /* (diag of A)*x */
851:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
852:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
853:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
854:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
855:       v        += 16; jmin++;
856:     }
857:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
858:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
859:     for (j=jmin; j<n; j++) {
860:       /* (strict lower triangular part of A)*x  */
861:       cval       = ib[j]*4;
862:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
863:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
864:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
865:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
866:       /* (strict upper triangular part of A)*x  */
867:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
868:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
869:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
870:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
871:       v        += 16;
872:     }
873:     xb +=4; ai++;
874:   }

876:   VecRestoreArrayRead(xx,&x);
877:   VecRestoreArray(zz,&z);

879:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
880:   return(0);
881: }

885: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
886: {
887:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
888:   PetscScalar       *z,x1,x2,x3,x4,x5;
889:   const PetscScalar *x,*xb;
890:   const MatScalar   *v;
891:   PetscErrorCode    ierr;
892:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
893:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
894:   PetscInt          nonzerorow=0;

897:   VecCopy(yy,zz);
898:   VecGetArrayRead(xx,&x);
899:   VecGetArray(zz,&z);

901:   v  = a->a;
902:   xb = x;

904:   for (i=0; i<mbs; i++) {
905:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
906:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
907:     ib          = aj + *ai;
908:     jmin        = 0;
909:     nonzerorow += (n>0);
910:     if (*ib == i) {      /* (diag of A)*x */
911:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
912:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
913:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
914:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
915:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
916:       v        += 25; jmin++;
917:     }
918:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
919:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
920:     for (j=jmin; j<n; j++) {
921:       /* (strict lower triangular part of A)*x  */
922:       cval       = ib[j]*5;
923:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
924:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
925:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
926:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
927:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
928:       /* (strict upper triangular part of A)*x  */
929:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
930:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
931:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
932:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
933:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
934:       v        += 25;
935:     }
936:     xb +=5; ai++;
937:   }

939:   VecRestoreArrayRead(xx,&x);
940:   VecRestoreArray(zz,&z);

942:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
943:   return(0);
944: }
947: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
948: {
949:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
950:   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
951:   const PetscScalar *x,*xb;
952:   const MatScalar   *v;
953:   PetscErrorCode    ierr;
954:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
955:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
956:   PetscInt          nonzerorow=0;

959:   VecCopy(yy,zz);
960:   VecGetArrayRead(xx,&x);
961:   VecGetArray(zz,&z);

963:   v  = a->a;
964:   xb = x;

966:   for (i=0; i<mbs; i++) {
967:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
968:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
969:     ib          = aj + *ai;
970:     jmin        = 0;
971:     nonzerorow += (n>0);
972:     if (*ib == i) {     /* (diag of A)*x */
973:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
974:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
975:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
976:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
977:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
978:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
979:       v        += 36; jmin++;
980:     }
981:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
982:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
983:     for (j=jmin; j<n; j++) {
984:       /* (strict lower triangular part of A)*x  */
985:       cval       = ib[j]*6;
986:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
987:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
988:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
989:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
990:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
991:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
992:       /* (strict upper triangular part of A)*x  */
993:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
994:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
995:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
996:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
997:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
998:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
999:       v        += 36;
1000:     }
1001:     xb +=6; ai++;
1002:   }

1004:   VecRestoreArrayRead(xx,&x);
1005:   VecRestoreArray(zz,&z);

1007:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1008:   return(0);
1009: }

1013: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1014: {
1015:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1016:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1017:   const PetscScalar *x,*xb;
1018:   const MatScalar   *v;
1019:   PetscErrorCode    ierr;
1020:   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1021:   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1022:   PetscInt          nonzerorow=0;

1025:   VecCopy(yy,zz);
1026:   VecGetArrayRead(xx,&x);
1027:   VecGetArray(zz,&z);

1029:   v  = a->a;
1030:   xb = x;

1032:   for (i=0; i<mbs; i++) {
1033:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1034:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1035:     ib          = aj + *ai;
1036:     jmin        = 0;
1037:     nonzerorow += (n>0);
1038:     if (*ib == i) {     /* (diag of A)*x */
1039:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1040:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1041:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1042:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1043:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1044:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1045:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1046:       v        += 49; jmin++;
1047:     }
1048:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1049:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050:     for (j=jmin; j<n; j++) {
1051:       /* (strict lower triangular part of A)*x  */
1052:       cval       = ib[j]*7;
1053:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1054:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1055:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1056:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1057:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1058:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1059:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1060:       /* (strict upper triangular part of A)*x  */
1061:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1062:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1063:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1064:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1065:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1066:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1067:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1068:       v       += 49;
1069:     }
1070:     xb +=7; ai++;
1071:   }

1073:   VecRestoreArrayRead(xx,&x);
1074:   VecRestoreArray(zz,&z);

1076:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1077:   return(0);
1078: }

1082: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1083: {
1084:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1085:   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1086:   const PetscScalar *x,*x_ptr,*xb;
1087:   const MatScalar   *v;
1088:   PetscErrorCode    ierr;
1089:   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1090:   const PetscInt    *idx,*aj,*ii;
1091:   PetscInt          nonzerorow=0;

1094:   VecCopy(yy,zz);
1095:   VecGetArrayRead(xx,&x); x_ptr=x;
1096:   VecGetArray(zz,&z); z_ptr=z;

1098:   aj = a->j;
1099:   v  = a->a;
1100:   ii = a->i;

1102:   if (!a->mult_work) {
1103:     PetscMalloc1(A->rmap->n+1,&a->mult_work);
1104:   }
1105:   work = a->mult_work;


1108:   for (i=0; i<mbs; i++) {
1109:     n           = ii[1] - ii[0]; ncols = n*bs;
1110:     workt       = work; idx=aj+ii[0];
1111:     nonzerorow += (n>0);

1113:     /* upper triangular part */
1114:     for (j=0; j<n; j++) {
1115:       xb = x_ptr + bs*(*idx++);
1116:       for (k=0; k<bs; k++) workt[k] = xb[k];
1117:       workt += bs;
1118:     }
1119:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1120:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1122:     /* strict lower triangular part */
1123:     idx = aj+ii[0];
1124:     if (*idx == i) {
1125:       ncols -= bs; v += bs2; idx++; n--;
1126:     }
1127:     if (ncols > 0) {
1128:       workt = work;
1129:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
1130:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1131:       for (j=0; j<n; j++) {
1132:         zb = z_ptr + bs*(*idx++);
1133:         for (k=0; k<bs; k++) zb[k] += workt[k];
1134:         workt += bs;
1135:       }
1136:     }

1138:     x += bs; v += n*bs2; z += bs; ii++;
1139:   }

1141:   VecRestoreArrayRead(xx,&x);
1142:   VecRestoreArray(zz,&z);

1144:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1145:   return(0);
1146: }

1150: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1151: {
1152:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1153:   PetscScalar    oalpha = alpha;
1155:   PetscBLASInt   one = 1,totalnz;

1158:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1159:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1160:   PetscLogFlops(totalnz);
1161:   return(0);
1162: }

1166: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1167: {
1168:   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1169:   const MatScalar *v       = a->a;
1170:   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1171:   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1172:   PetscErrorCode  ierr;
1173:   const PetscInt  *aj=a->j,*col;

1176:   if (type == NORM_FROBENIUS) {
1177:     for (k=0; k<mbs; k++) {
1178:       jmin = a->i[k]; jmax = a->i[k+1];
1179:       col  = aj + jmin;
1180:       if (*col == k) {         /* diagonal block */
1181:         for (i=0; i<bs2; i++) {
1182:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1183:         }
1184:         jmin++;
1185:       }
1186:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1187:         for (i=0; i<bs2; i++) {
1188:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1189:         }
1190:       }
1191:     }
1192:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1193:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1194:     PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1195:     for (i=0; i<mbs; i++) jl[i] = mbs;
1196:     il[0] = 0;

1198:     *norm = 0.0;
1199:     for (k=0; k<mbs; k++) { /* k_th block row */
1200:       for (j=0; j<bs; j++) sum[j]=0.0;
1201:       /*-- col sum --*/
1202:       i = jl[k]; /* first |A(i,k)| to be added */
1203:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1204:                   at step k */
1205:       while (i<mbs) {
1206:         nexti = jl[i];  /* next block row to be added */
1207:         ik    = il[i];  /* block index of A(i,k) in the array a */
1208:         for (j=0; j<bs; j++) {
1209:           v = a->a + ik*bs2 + j*bs;
1210:           for (k1=0; k1<bs; k1++) {
1211:             sum[j] += PetscAbsScalar(*v); v++;
1212:           }
1213:         }
1214:         /* update il, jl */
1215:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1216:         jmax = a->i[i+1];
1217:         if (jmin < jmax) {
1218:           il[i] = jmin;
1219:           j     = a->j[jmin];
1220:           jl[i] = jl[j]; jl[j]=i;
1221:         }
1222:         i = nexti;
1223:       }
1224:       /*-- row sum --*/
1225:       jmin = a->i[k]; jmax = a->i[k+1];
1226:       for (i=jmin; i<jmax; i++) {
1227:         for (j=0; j<bs; j++) {
1228:           v = a->a + i*bs2 + j;
1229:           for (k1=0; k1<bs; k1++) {
1230:             sum[j] += PetscAbsScalar(*v); v += bs;
1231:           }
1232:         }
1233:       }
1234:       /* add k_th block row to il, jl */
1235:       col = aj+jmin;
1236:       if (*col == k) jmin++;
1237:       if (jmin < jmax) {
1238:         il[k] = jmin;
1239:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1240:       }
1241:       for (j=0; j<bs; j++) {
1242:         if (sum[j] > *norm) *norm = sum[j];
1243:       }
1244:     }
1245:     PetscFree3(sum,il,jl);
1246:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1247:   return(0);
1248: }

1252: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1253: {
1254:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1258:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1259:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1260:     *flg = PETSC_FALSE;
1261:     return(0);
1262:   }

1264:   /* if the a->i are the same */
1265:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1266:   if (!*flg) return(0);

1268:   /* if a->j are the same */
1269:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1270:   if (!*flg) return(0);

1272:   /* if a->a are the same */
1273:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1274:   return(0);
1275: }

1279: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1280: {
1281:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1282:   PetscErrorCode  ierr;
1283:   PetscInt        i,j,k,row,bs,ambs,bs2;
1284:   const PetscInt  *ai,*aj;
1285:   PetscScalar     *x,zero = 0.0;
1286:   const MatScalar *aa,*aa_j;

1289:   bs = A->rmap->bs;
1290:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");

1292:   aa   = a->a;
1293:   ambs = a->mbs;

1295:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1296:     PetscInt *diag=a->diag;
1297:     aa   = a->a;
1298:     ambs = a->mbs;
1299:     VecGetArray(v,&x);
1300:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1301:     VecRestoreArray(v,&x);
1302:     return(0);
1303:   }

1305:   ai   = a->i;
1306:   aj   = a->j;
1307:   bs2  = a->bs2;
1308:   VecSet(v,zero);
1309:   VecGetArray(v,&x);
1310:   for (i=0; i<ambs; i++) {
1311:     j=ai[i];
1312:     if (aj[j] == i) {    /* if this is a diagonal element */
1313:       row  = i*bs;
1314:       aa_j = aa + j*bs2;
1315:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1316:     }
1317:   }
1318:   VecRestoreArray(v,&x);
1319:   return(0);
1320: }

1324: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1325: {
1326:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1327:   PetscScalar       x;
1328:   const PetscScalar *l,*li,*ri;
1329:   MatScalar         *aa,*v;
1330:   PetscErrorCode    ierr;
1331:   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1332:   PetscBool         flg;

1335:   if (ll != rr) {
1336:     VecEqual(ll,rr,&flg);
1337:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1338:   }
1339:   if (!ll) return(0);
1340:   ai  = a->i;
1341:   aj  = a->j;
1342:   aa  = a->a;
1343:   m   = A->rmap->N;
1344:   bs  = A->rmap->bs;
1345:   mbs = a->mbs;
1346:   bs2 = a->bs2;

1348:   VecGetArrayRead(ll,&l);
1349:   VecGetLocalSize(ll,&lm);
1350:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1351:   for (i=0; i<mbs; i++) { /* for each block row */
1352:     M  = ai[i+1] - ai[i];
1353:     li = l + i*bs;
1354:     v  = aa + bs2*ai[i];
1355:     for (j=0; j<M; j++) { /* for each block */
1356:       ri = l + bs*aj[ai[i]+j];
1357:       for (k=0; k<bs; k++) {
1358:         x = ri[k];
1359:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1360:       }
1361:     }
1362:   }
1363:   VecRestoreArrayRead(ll,&l);
1364:   PetscLogFlops(2.0*a->nz);
1365:   return(0);
1366: }

1370: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1371: {
1372:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1375:   info->block_size   = a->bs2;
1376:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1377:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1378:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1379:   info->assemblies   = A->num_ass;
1380:   info->mallocs      = A->info.mallocs;
1381:   info->memory       = ((PetscObject)A)->mem;
1382:   if (A->factortype) {
1383:     info->fill_ratio_given  = A->info.fill_ratio_given;
1384:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1385:     info->factor_mallocs    = A->info.factor_mallocs;
1386:   } else {
1387:     info->fill_ratio_given  = 0;
1388:     info->fill_ratio_needed = 0;
1389:     info->factor_mallocs    = 0;
1390:   }
1391:   return(0);
1392: }


1397: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1398: {
1399:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1403:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1404:   return(0);
1405: }

1409: /*
1410:    This code does not work since it only checks the upper triangular part of
1411:   the matrix. Hence it is not listed in the function table.
1412: */
1413: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1414: {
1415:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1416:   PetscErrorCode  ierr;
1417:   PetscInt        i,j,n,row,col,bs,mbs;
1418:   const PetscInt  *ai,*aj;
1419:   PetscReal       atmp;
1420:   const MatScalar *aa;
1421:   PetscScalar     *x;
1422:   PetscInt        ncols,brow,bcol,krow,kcol;

1425:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1426:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1427:   bs  = A->rmap->bs;
1428:   aa  = a->a;
1429:   ai  = a->i;
1430:   aj  = a->j;
1431:   mbs = a->mbs;

1433:   VecSet(v,0.0);
1434:   VecGetArray(v,&x);
1435:   VecGetLocalSize(v,&n);
1436:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1437:   for (i=0; i<mbs; i++) {
1438:     ncols = ai[1] - ai[0]; ai++;
1439:     brow  = bs*i;
1440:     for (j=0; j<ncols; j++) {
1441:       bcol = bs*(*aj);
1442:       for (kcol=0; kcol<bs; kcol++) {
1443:         col = bcol + kcol;      /* col index */
1444:         for (krow=0; krow<bs; krow++) {
1445:           atmp = PetscAbsScalar(*aa); aa++;
1446:           row  = brow + krow;   /* row index */
1447:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1448:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1449:         }
1450:       }
1451:       aj++;
1452:     }
1453:   }
1454:   VecRestoreArray(v,&x);
1455:   return(0);
1456: }