Actual source code: sbaij2.c

petsc-3.4.5 2014-06-29
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <petsc-private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6: #include <petscblaslapack.h>

 10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 14:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 15:   const PetscInt *idx;
 16:   PetscBT        table_out,table_in;

 19:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 20:   mbs  = a->mbs;
 21:   ai   = a->i;
 22:   aj   = a->j;
 23:   bs   = A->rmap->bs;
 24:   PetscBTCreate(mbs,&table_out);
 25:   PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);
 26:   PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
 27:   PetscBTCreate(mbs,&table_in);

 29:   for (i=0; i<is_max; i++) { /* for each is */
 30:     isz  = 0;
 31:     PetscBTMemzero(mbs,table_out);

 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 38:     bcol_max = 0;
 39:     for (j=0; j<n; ++j) {
 40:       brow = idx[j]/bs; /* convert the indices into block indices */
 41:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if (!PetscBTLookupSet(table_out,brow)) {
 43:         nidx[isz++] = brow;
 44:         if (bcol_max < brow) bcol_max = brow;
 45:       }
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);

 50:     k = 0;
 51:     for (j=0; j<ov; j++) { /* for each overlap */
 52:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 53:       PetscBTMemzero(mbs,table_in);
 54:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 56:       n = isz;  /* length of the updated is[i] */
 57:       for (brow=0; brow<mbs; brow++) {
 58:         start = ai[brow]; end   = ai[brow+1];
 59:         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 60:           for (l = start; l<end; l++) {
 61:             bcol = aj[l];
 62:             if (!PetscBTLookupSet(table_out,bcol)) {
 63:               nidx[isz++] = bcol;
 64:               if (bcol_max < bcol) bcol_max = bcol;
 65:             }
 66:           }
 67:           k++;
 68:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 69:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 70:           for (l = start; l<end; l++) {
 71:             bcol = aj[l];
 72:             if (bcol > bcol_max) break;
 73:             if (PetscBTLookup(table_in,bcol)) {
 74:               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
 75:               break; /* for l = start; l<end ; l++) */
 76:             }
 77:           }
 78:         }
 79:       }
 80:     } /* for each overlap */

 82:     /* expand the Index Set */
 83:     for (j=0; j<isz; j++) {
 84:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 85:     }
 86:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 87:   }
 88:   PetscBTDestroy(&table_out);
 89:   PetscFree(nidx);
 90:   PetscFree(nidx2);
 91:   PetscBTDestroy(&table_in);
 92:   return(0);
 93: }

 97: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 98: {
 99:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
101:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
103:   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104:   const PetscInt *irow,*aj = a->j,*ai = a->i;
105:   MatScalar      *mat_a;
106:   Mat            C;
107:   PetscBool      flag,sorted;

110:   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
111:   ISSorted(iscol,&sorted);
112:   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

114:   ISGetIndices(isrow,&irow);
115:   ISGetSize(isrow,&nrows);

117:   PetscMalloc(oldcols*sizeof(PetscInt),&smap);
118:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
119:   ssmap = smap;
120:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
121:   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122:   /* determine lens of each row */
123:   for (i=0; i<nrows; i++) {
124:     kstart  = ai[irow[i]];
125:     kend    = kstart + a->ilen[irow[i]];
126:     lens[i] = 0;
127:     for (k=kstart; k<kend; k++) {
128:       if (ssmap[aj[k]]) lens[i]++;
129:     }
130:   }
131:   /* Create and fill new matrix */
132:   if (scall == MAT_REUSE_MATRIX) {
133:     c = (Mat_SeqSBAIJ*)((*B)->data);

135:     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
137:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
138:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
139:     C    = *B;
140:   } else {
141:     MatCreate(PetscObjectComm((PetscObject)A),&C);
142:     MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);
143:     MatSetType(C,((PetscObject)A)->type_name);
144:     MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
145:   }
146:   c = (Mat_SeqSBAIJ*)(C->data);
147:   for (i=0; i<nrows; i++) {
148:     row      = irow[i];
149:     kstart   = ai[row];
150:     kend     = kstart + a->ilen[row];
151:     mat_i    = c->i[i];
152:     mat_j    = c->j + mat_i;
153:     mat_a    = c->a + mat_i*bs2;
154:     mat_ilen = c->ilen + i;
155:     for (k=kstart; k<kend; k++) {
156:       if ((tcol=ssmap[a->j[k]])) {
157:         *mat_j++ = tcol - 1;
158:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
159:         mat_a   += bs2;
160:         (*mat_ilen)++;
161:       }
162:     }
163:   }

165:   /* Free work space */
166:   PetscFree(smap);
167:   PetscFree(lens);
168:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
169:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

171:   ISRestoreIndices(isrow,&irow);
172:   *B   = C;
173:   return(0);
174: }

178: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
179: {
180:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
181:   IS             is1;
183:   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
184:   const PetscInt *irow;

187:   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");

189:   ISGetIndices(isrow,&irow);
190:   ISGetSize(isrow,&nrows);

192:   /* Verify if the indices corespond to each element in a block
193:    and form the IS with compressed IS */
194:   PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
195:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
196:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;

198:   count = 0;
199:   for (i=0; i<a->mbs; i++) {
200:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
201:     if (vary[i]==bs) iary[count++] = i;
202:   }
203:   ISRestoreIndices(isrow,&irow);
204:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
205:   PetscFree2(vary,iary);

207:   MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);
208:   ISDestroy(&is1);
209:   return(0);
210: }

214: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
215: {
217:   PetscInt       i;

220:   if (scall == MAT_INITIAL_MATRIX) {
221:     PetscMalloc((n+1)*sizeof(Mat),B);
222:   }

224:   for (i=0; i<n; i++) {
225:     MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
226:   }
227:   return(0);
228: }

230: /* -------------------------------------------------------*/
231: /* Should check that shapes of vectors and matrices match */
232: /* -------------------------------------------------------*/

236: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
237: {
238:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
239:   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
240:   MatScalar      *v;
242:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
243:   PetscInt       nonzerorow=0;

246:   VecSet(zz,zero);
247:   VecGetArray(xx,&x);
248:   VecGetArray(zz,&z);

250:   v  = a->a;
251:   xb = x;

253:   for (i=0; i<mbs; i++) {
254:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
255:     x1          = xb[0]; x2 = xb[1];
256:     ib          = aj + *ai;
257:     jmin        = 0;
258:     nonzerorow += (n>0);
259:     if (*ib == i) {     /* (diag of A)*x */
260:       z[2*i]   += v[0]*x1 + v[2]*x2;
261:       z[2*i+1] += v[2]*x1 + v[3]*x2;
262:       v        += 4; jmin++;
263:     }
264:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
265:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
266:     for (j=jmin; j<n; j++) {
267:       /* (strict lower triangular part of A)*x  */
268:       cval       = ib[j]*2;
269:       z[cval]   += v[0]*x1 + v[1]*x2;
270:       z[cval+1] += v[2]*x1 + v[3]*x2;
271:       /* (strict upper triangular part of A)*x  */
272:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
273:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
274:       v        += 4;
275:     }
276:     xb +=2; ai++;
277:   }

279:   VecRestoreArray(xx,&x);
280:   VecRestoreArray(zz,&z);
281:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
282:   return(0);
283: }

287: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
288: {
289:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
290:   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
291:   MatScalar      *v;
293:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
294:   PetscInt       nonzerorow=0;

297:   VecSet(zz,zero);
298:   VecGetArray(xx,&x);
299:   VecGetArray(zz,&z);

301:   v  = a->a;
302:   xb = x;

304:   for (i=0; i<mbs; i++) {
305:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
306:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
307:     ib          = aj + *ai;
308:     jmin        = 0;
309:     nonzerorow += (n>0);
310:     if (*ib == i) {     /* (diag of A)*x */
311:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
312:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
313:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
314:       v        += 9; jmin++;
315:     }
316:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
317:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
318:     for (j=jmin; j<n; j++) {
319:       /* (strict lower triangular part of A)*x  */
320:       cval       = ib[j]*3;
321:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
322:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
323:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
324:       /* (strict upper triangular part of A)*x  */
325:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
326:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
327:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
328:       v        += 9;
329:     }
330:     xb +=3; ai++;
331:   }

333:   VecRestoreArray(xx,&x);
334:   VecRestoreArray(zz,&z);
335:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
336:   return(0);
337: }

341: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
342: {
343:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
344:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
345:   MatScalar      *v;
347:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
348:   PetscInt       nonzerorow=0;

351:   VecSet(zz,zero);
352:   VecGetArray(xx,&x);
353:   VecGetArray(zz,&z);

355:   v  = a->a;
356:   xb = x;

358:   for (i=0; i<mbs; i++) {
359:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
360:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
361:     ib          = aj + *ai;
362:     jmin        = 0;
363:     nonzerorow += (n>0);
364:     if (*ib == i) {     /* (diag of A)*x */
365:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
366:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
367:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
368:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
369:       v        += 16; jmin++;
370:     }
371:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
373:     for (j=jmin; j<n; j++) {
374:       /* (strict lower triangular part of A)*x  */
375:       cval       = ib[j]*4;
376:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
377:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
378:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
379:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
380:       /* (strict upper triangular part of A)*x  */
381:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
382:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
383:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
384:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
385:       v        += 16;
386:     }
387:     xb +=4; ai++;
388:   }

390:   VecRestoreArray(xx,&x);
391:   VecRestoreArray(zz,&z);
392:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
393:   return(0);
394: }

398: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
399: {
400:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
401:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
402:   MatScalar      *v;
404:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
405:   PetscInt       nonzerorow=0;

408:   VecSet(zz,zero);
409:   VecGetArray(xx,&x);
410:   VecGetArray(zz,&z);

412:   v  = a->a;
413:   xb = x;

415:   for (i=0; i<mbs; i++) {
416:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
418:     ib          = aj + *ai;
419:     jmin        = 0;
420:     nonzerorow += (n>0);
421:     if (*ib == i) {      /* (diag of A)*x */
422:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
423:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
424:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
425:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
426:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
427:       v        += 25; jmin++;
428:     }
429:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
430:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431:     for (j=jmin; j<n; j++) {
432:       /* (strict lower triangular part of A)*x  */
433:       cval       = ib[j]*5;
434:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
435:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
436:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
437:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
438:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
439:       /* (strict upper triangular part of A)*x  */
440:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
441:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
442:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
443:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
444:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
445:       v        += 25;
446:     }
447:     xb +=5; ai++;
448:   }

450:   VecRestoreArray(xx,&x);
451:   VecRestoreArray(zz,&z);
452:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
453:   return(0);
454: }


459: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
460: {
461:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
462:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
463:   MatScalar      *v;
465:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
466:   PetscInt       nonzerorow=0;

469:   VecSet(zz,zero);
470:   VecGetArray(xx,&x);
471:   VecGetArray(zz,&z);

473:   v  = a->a;
474:   xb = x;

476:   for (i=0; i<mbs; i++) {
477:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
478:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
479:     ib          = aj + *ai;
480:     jmin        = 0;
481:     nonzerorow += (n>0);
482:     if (*ib == i) {      /* (diag of A)*x */
483:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
484:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
485:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
486:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
487:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
488:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
489:       v        += 36; jmin++;
490:     }
491:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493:     for (j=jmin; j<n; j++) {
494:       /* (strict lower triangular part of A)*x  */
495:       cval       = ib[j]*6;
496:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
497:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
498:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
499:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
500:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
501:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
502:       /* (strict upper triangular part of A)*x  */
503:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
504:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
505:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
506:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
507:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
508:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
509:       v        += 36;
510:     }
511:     xb +=6; ai++;
512:   }

514:   VecRestoreArray(xx,&x);
515:   VecRestoreArray(zz,&z);
516:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
517:   return(0);
518: }
521: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
522: {
523:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
524:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
525:   MatScalar      *v;
527:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
528:   PetscInt       nonzerorow=0;

531:   VecSet(zz,zero);
532:   VecGetArray(xx,&x);
533:   VecGetArray(zz,&z);

535:   v  = a->a;
536:   xb = x;

538:   for (i=0; i<mbs; i++) {
539:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
540:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
541:     ib          = aj + *ai;
542:     jmin        = 0;
543:     nonzerorow += (n>0);
544:     if (*ib == i) {      /* (diag of A)*x */
545:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
546:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
547:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
548:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
549:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
550:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
551:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
552:       v        += 49; jmin++;
553:     }
554:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
555:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
556:     for (j=jmin; j<n; j++) {
557:       /* (strict lower triangular part of A)*x  */
558:       cval       = ib[j]*7;
559:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
560:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
561:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
562:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
563:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
564:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
565:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
566:       /* (strict upper triangular part of A)*x  */
567:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
568:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
569:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
570:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
571:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
572:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
573:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
574:       v       += 49;
575:     }
576:     xb +=7; ai++;
577:   }
578:   VecRestoreArray(xx,&x);
579:   VecRestoreArray(zz,&z);
580:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
581:   return(0);
582: }

584: /*
585:     This will not work with MatScalar == float because it calls the BLAS
586: */
589: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
590: {
591:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
592:   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
593:   MatScalar      *v;
595:   PetscInt       mbs       =a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
596:   PetscInt       nonzerorow=0;

599:   VecSet(zz,zero);
600:   VecGetArray(xx,&x); x_ptr=x;
601:   VecGetArray(zz,&z); z_ptr=z;

603:   aj = a->j;
604:   v  = a->a;
605:   ii = a->i;

607:   if (!a->mult_work) {
608:     PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);
609:   }
610:   work = a->mult_work;

612:   for (i=0; i<mbs; i++) {
613:     n           = ii[1] - ii[0]; ncols = n*bs;
614:     workt       = work; idx=aj+ii[0];
615:     nonzerorow += (n>0);

617:     /* upper triangular part */
618:     for (j=0; j<n; j++) {
619:       xb = x_ptr + bs*(*idx++);
620:       for (k=0; k<bs; k++) workt[k] = xb[k];
621:       workt += bs;
622:     }
623:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
624:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

626:     /* strict lower triangular part */
627:     idx = aj+ii[0];
628:     if (*idx == i) {
629:       ncols -= bs; v += bs2; idx++; n--;
630:     }

632:     if (ncols > 0) {
633:       workt = work;
634:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
635:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
636:       for (j=0; j<n; j++) {
637:         zb = z_ptr + bs*(*idx++);
638:         for (k=0; k<bs; k++) zb[k] += workt[k];
639:         workt += bs;
640:       }
641:     }
642:     x += bs; v += n*bs2; z += bs; ii++;
643:   }

645:   VecRestoreArray(xx,&x);
646:   VecRestoreArray(zz,&z);
647:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
648:   return(0);
649: }

653: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
654: {
655:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
656:   PetscScalar    *x,*z,*xb,x1;
657:   MatScalar      *v;
659:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
660:   PetscInt       nonzerorow=0;

663:   VecCopy(yy,zz);
664:   VecGetArray(xx,&x);
665:   VecGetArray(zz,&z);
666:   v    = a->a;
667:   xb   = x;

669:   for (i=0; i<mbs; i++) {
670:     n           = ai[1] - ai[0]; /* length of i_th row of A */
671:     x1          = xb[0];
672:     ib          = aj + *ai;
673:     jmin        = 0;
674:     nonzerorow += (n>0);
675:     if (*ib == i) {            /* (diag of A)*x */
676:       z[i] += *v++ * x[*ib++]; jmin++;
677:     }
678:     for (j=jmin; j<n; j++) {
679:       cval    = *ib;
680:       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
681:       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
682:     }
683:     xb++; ai++;
684:   }

686:   VecRestoreArray(xx,&x);
687:   VecRestoreArray(zz,&z);

689:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
690:   return(0);
691: }

695: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
696: {
697:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
698:   PetscScalar    *x,*z,*xb,x1,x2;
699:   MatScalar      *v;
701:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
702:   PetscInt       nonzerorow=0;

705:   VecCopy(yy,zz);
706:   VecGetArray(xx,&x);
707:   VecGetArray(zz,&z);

709:   v  = a->a;
710:   xb = x;

712:   for (i=0; i<mbs; i++) {
713:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
714:     x1          = xb[0]; x2 = xb[1];
715:     ib          = aj + *ai;
716:     jmin        = 0;
717:     nonzerorow += (n>0);
718:     if (*ib == i) {      /* (diag of A)*x */
719:       z[2*i]   += v[0]*x1 + v[2]*x2;
720:       z[2*i+1] += v[2]*x1 + v[3]*x2;
721:       v        += 4; jmin++;
722:     }
723:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
724:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
725:     for (j=jmin; j<n; j++) {
726:       /* (strict lower triangular part of A)*x  */
727:       cval       = ib[j]*2;
728:       z[cval]   += v[0]*x1 + v[1]*x2;
729:       z[cval+1] += v[2]*x1 + v[3]*x2;
730:       /* (strict upper triangular part of A)*x  */
731:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
732:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
733:       v        += 4;
734:     }
735:     xb +=2; ai++;
736:   }
737:   VecRestoreArray(xx,&x);
738:   VecRestoreArray(zz,&z);

740:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
741:   return(0);
742: }

746: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
747: {
748:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
749:   PetscScalar    *x,*z,*xb,x1,x2,x3;
750:   MatScalar      *v;
752:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
753:   PetscInt       nonzerorow=0;

756:   VecCopy(yy,zz);
757:   VecGetArray(xx,&x);
758:   VecGetArray(zz,&z);

760:   v  = a->a;
761:   xb = x;

763:   for (i=0; i<mbs; i++) {
764:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
765:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
766:     ib          = aj + *ai;
767:     jmin        = 0;
768:     nonzerorow += (n>0);
769:     if (*ib == i) {     /* (diag of A)*x */
770:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
771:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
772:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
773:       v        += 9; jmin++;
774:     }
775:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
776:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
777:     for (j=jmin; j<n; j++) {
778:       /* (strict lower triangular part of A)*x  */
779:       cval       = ib[j]*3;
780:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
781:       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
782:       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
783:       /* (strict upper triangular part of A)*x  */
784:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
785:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
786:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
787:       v        += 9;
788:     }
789:     xb +=3; ai++;
790:   }

792:   VecRestoreArray(xx,&x);
793:   VecRestoreArray(zz,&z);

795:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
796:   return(0);
797: }

801: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
802: {
803:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
804:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
805:   MatScalar      *v;
807:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
808:   PetscInt       nonzerorow=0;

811:   VecCopy(yy,zz);
812:   VecGetArray(xx,&x);
813:   VecGetArray(zz,&z);

815:   v  = a->a;
816:   xb = x;

818:   for (i=0; i<mbs; i++) {
819:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
820:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
821:     ib          = aj + *ai;
822:     jmin        = 0;
823:     nonzerorow += (n>0);
824:     if (*ib == i) {      /* (diag of A)*x */
825:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
826:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
827:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
828:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
829:       v        += 16; jmin++;
830:     }
831:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833:     for (j=jmin; j<n; j++) {
834:       /* (strict lower triangular part of A)*x  */
835:       cval       = ib[j]*4;
836:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
837:       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
838:       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
839:       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
840:       /* (strict upper triangular part of A)*x  */
841:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
842:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
843:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
844:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
845:       v        += 16;
846:     }
847:     xb +=4; ai++;
848:   }

850:   VecRestoreArray(xx,&x);
851:   VecRestoreArray(zz,&z);

853:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
854:   return(0);
855: }

859: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
860: {
861:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
862:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
863:   MatScalar      *v;
865:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
866:   PetscInt       nonzerorow=0;

869:   VecCopy(yy,zz);
870:   VecGetArray(xx,&x);
871:   VecGetArray(zz,&z);

873:   v  = a->a;
874:   xb = x;

876:   for (i=0; i<mbs; i++) {
877:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
878:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
879:     ib          = aj + *ai;
880:     jmin        = 0;
881:     nonzerorow += (n>0);
882:     if (*ib == i) {      /* (diag of A)*x */
883:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
884:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
885:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
886:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
887:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
888:       v        += 25; jmin++;
889:     }
890:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
891:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
892:     for (j=jmin; j<n; j++) {
893:       /* (strict lower triangular part of A)*x  */
894:       cval       = ib[j]*5;
895:       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
896:       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
897:       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
898:       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
899:       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
900:       /* (strict upper triangular part of A)*x  */
901:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
902:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
903:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
904:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
905:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
906:       v        += 25;
907:     }
908:     xb +=5; ai++;
909:   }

911:   VecRestoreArray(xx,&x);
912:   VecRestoreArray(zz,&z);

914:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
915:   return(0);
916: }
919: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
920: {
921:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
922:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
923:   MatScalar      *v;
925:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
926:   PetscInt       nonzerorow=0;

929:   VecCopy(yy,zz);
930:   VecGetArray(xx,&x);
931:   VecGetArray(zz,&z);

933:   v  = a->a;
934:   xb = x;

936:   for (i=0; i<mbs; i++) {
937:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
938:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
939:     ib          = aj + *ai;
940:     jmin        = 0;
941:     nonzerorow += (n>0);
942:     if (*ib == i) {     /* (diag of A)*x */
943:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
944:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
945:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
946:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
947:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
948:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
949:       v        += 36; jmin++;
950:     }
951:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
952:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
953:     for (j=jmin; j<n; j++) {
954:       /* (strict lower triangular part of A)*x  */
955:       cval       = ib[j]*6;
956:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
957:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
958:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
959:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
960:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
961:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
962:       /* (strict upper triangular part of A)*x  */
963:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
964:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
965:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
966:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
967:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
968:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
969:       v        += 36;
970:     }
971:     xb +=6; ai++;
972:   }

974:   VecRestoreArray(xx,&x);
975:   VecRestoreArray(zz,&z);

977:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
978:   return(0);
979: }

983: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
984: {
985:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
986:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
987:   MatScalar      *v;
989:   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
990:   PetscInt       nonzerorow=0;

993:   VecCopy(yy,zz);
994:   VecGetArray(xx,&x);
995:   VecGetArray(zz,&z);

997:   v  = a->a;
998:   xb = x;

1000:   for (i=0; i<mbs; i++) {
1001:     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1002:     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1003:     ib          = aj + *ai;
1004:     jmin        = 0;
1005:     nonzerorow += (n>0);
1006:     if (*ib == i) {     /* (diag of A)*x */
1007:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1008:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1009:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1010:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1011:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1012:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1013:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1014:       v        += 49; jmin++;
1015:     }
1016:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1017:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1018:     for (j=jmin; j<n; j++) {
1019:       /* (strict lower triangular part of A)*x  */
1020:       cval       = ib[j]*7;
1021:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1022:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1023:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1024:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1025:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1026:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1027:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1028:       /* (strict upper triangular part of A)*x  */
1029:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1030:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1031:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1032:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1033:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1034:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1035:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1036:       v       += 49;
1037:     }
1038:     xb +=7; ai++;
1039:   }

1041:   VecRestoreArray(xx,&x);
1042:   VecRestoreArray(zz,&z);

1044:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1045:   return(0);
1046: }

1050: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1051: {
1052:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1053:   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1054:   MatScalar      *v;
1056:   PetscInt       mbs       =a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1057:   PetscInt       nonzerorow=0;

1060:   VecCopy(yy,zz);
1061:   VecGetArray(xx,&x); x_ptr=x;
1062:   VecGetArray(zz,&z); z_ptr=z;

1064:   aj = a->j;
1065:   v  = a->a;
1066:   ii = a->i;

1068:   if (!a->mult_work) {
1069:     PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);
1070:   }
1071:   work = a->mult_work;


1074:   for (i=0; i<mbs; i++) {
1075:     n           = ii[1] - ii[0]; ncols = n*bs;
1076:     workt       = work; idx=aj+ii[0];
1077:     nonzerorow += (n>0);

1079:     /* upper triangular part */
1080:     for (j=0; j<n; j++) {
1081:       xb = x_ptr + bs*(*idx++);
1082:       for (k=0; k<bs; k++) workt[k] = xb[k];
1083:       workt += bs;
1084:     }
1085:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1086:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1088:     /* strict lower triangular part */
1089:     idx = aj+ii[0];
1090:     if (*idx == i) {
1091:       ncols -= bs; v += bs2; idx++; n--;
1092:     }
1093:     if (ncols > 0) {
1094:       workt = work;
1095:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
1096:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1097:       for (j=0; j<n; j++) {
1098:         zb = z_ptr + bs*(*idx++);
1099:         for (k=0; k<bs; k++) zb[k] += workt[k];
1100:         workt += bs;
1101:       }
1102:     }

1104:     x += bs; v += n*bs2; z += bs; ii++;
1105:   }

1107:   VecRestoreArray(xx,&x);
1108:   VecRestoreArray(zz,&z);

1110:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1111:   return(0);
1112: }

1116: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1117: {
1118:   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1119:   PetscScalar    oalpha = alpha;
1121:   PetscBLASInt   one = 1,totalnz;

1124:   PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1125:   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1126:   PetscLogFlops(totalnz);
1127:   return(0);
1128: }

1132: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1133: {
1134:   Mat_SeqSBAIJ   *a       = (Mat_SeqSBAIJ*)A->data;
1135:   MatScalar      *v       = a->a;
1136:   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1137:   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1139:   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;

1142:   if (type == NORM_FROBENIUS) {
1143:     for (k=0; k<mbs; k++) {
1144:       jmin = a->i[k]; jmax = a->i[k+1];
1145:       col  = aj + jmin;
1146:       if (*col == k) {         /* diagonal block */
1147:         for (i=0; i<bs2; i++) {
1148:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1149:         }
1150:         jmin++;
1151:       }
1152:       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1153:         for (i=0; i<bs2; i++) {
1154:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1155:         }
1156:       }
1157:     }
1158:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1159:   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1160:     PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);
1161:     for (i=0; i<mbs; i++) jl[i] = mbs;
1162:     il[0] = 0;

1164:     *norm = 0.0;
1165:     for (k=0; k<mbs; k++) { /* k_th block row */
1166:       for (j=0; j<bs; j++) sum[j]=0.0;
1167:       /*-- col sum --*/
1168:       i = jl[k]; /* first |A(i,k)| to be added */
1169:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1170:                   at step k */
1171:       while (i<mbs) {
1172:         nexti = jl[i];  /* next block row to be added */
1173:         ik    = il[i];  /* block index of A(i,k) in the array a */
1174:         for (j=0; j<bs; j++) {
1175:           v = a->a + ik*bs2 + j*bs;
1176:           for (k1=0; k1<bs; k1++) {
1177:             sum[j] += PetscAbsScalar(*v); v++;
1178:           }
1179:         }
1180:         /* update il, jl */
1181:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1182:         jmax = a->i[i+1];
1183:         if (jmin < jmax) {
1184:           il[i] = jmin;
1185:           j     = a->j[jmin];
1186:           jl[i] = jl[j]; jl[j]=i;
1187:         }
1188:         i = nexti;
1189:       }
1190:       /*-- row sum --*/
1191:       jmin = a->i[k]; jmax = a->i[k+1];
1192:       for (i=jmin; i<jmax; i++) {
1193:         for (j=0; j<bs; j++) {
1194:           v = a->a + i*bs2 + j;
1195:           for (k1=0; k1<bs; k1++) {
1196:             sum[j] += PetscAbsScalar(*v); v += bs;
1197:           }
1198:         }
1199:       }
1200:       /* add k_th block row to il, jl */
1201:       col = aj+jmin;
1202:       if (*col == k) jmin++;
1203:       if (jmin < jmax) {
1204:         il[k] = jmin;
1205:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1206:       }
1207:       for (j=0; j<bs; j++) {
1208:         if (sum[j] > *norm) *norm = sum[j];
1209:       }
1210:     }
1211:     PetscFree3(sum,il,jl);
1212:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1213:   return(0);
1214: }

1218: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1219: {
1220:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;

1224:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1225:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1226:     *flg = PETSC_FALSE;
1227:     return(0);
1228:   }

1230:   /* if the a->i are the same */
1231:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1232:   if (!*flg) return(0);

1234:   /* if a->j are the same */
1235:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1236:   if (!*flg) return(0);

1238:   /* if a->a are the same */
1239:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1240:   return(0);
1241: }

1245: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1246: {
1247:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1249:   PetscInt       i,j,k,row,bs,*ai,*aj,ambs,bs2;
1250:   PetscScalar    *x,zero = 0.0;
1251:   MatScalar      *aa,*aa_j;

1254:   bs = A->rmap->bs;
1255:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");

1257:   aa   = a->a;
1258:   ambs = a->mbs;

1260:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1261:     PetscInt *diag=a->diag;
1262:     aa   = a->a;
1263:     ambs = a->mbs;
1264:     VecGetArray(v,&x);
1265:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1266:     VecRestoreArray(v,&x);
1267:     return(0);
1268:   }

1270:   ai   = a->i;
1271:   aj   = a->j;
1272:   bs2  = a->bs2;
1273:   VecSet(v,zero);
1274:   VecGetArray(v,&x);
1275:   for (i=0; i<ambs; i++) {
1276:     j=ai[i];
1277:     if (aj[j] == i) {    /* if this is a diagonal element */
1278:       row  = i*bs;
1279:       aa_j = aa + j*bs2;
1280:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1281:     }
1282:   }
1283:   VecRestoreArray(v,&x);
1284:   return(0);
1285: }

1289: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1290: {
1291:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1292:   PetscScalar    *l,x,*li,*ri;
1293:   MatScalar      *aa,*v;
1295:   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1296:   PetscBool      flg;

1299:   if (ll != rr) {
1300:     VecEqual(ll,rr,&flg);
1301:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1302:   }
1303:   if (!ll) return(0);
1304:   ai  = a->i;
1305:   aj  = a->j;
1306:   aa  = a->a;
1307:   m   = A->rmap->N;
1308:   bs  = A->rmap->bs;
1309:   mbs = a->mbs;
1310:   bs2 = a->bs2;

1312:   VecGetArray(ll,&l);
1313:   VecGetLocalSize(ll,&lm);
1314:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1315:   for (i=0; i<mbs; i++) { /* for each block row */
1316:     M  = ai[i+1] - ai[i];
1317:     li = l + i*bs;
1318:     v  = aa + bs2*ai[i];
1319:     for (j=0; j<M; j++) { /* for each block */
1320:       ri = l + bs*aj[ai[i]+j];
1321:       for (k=0; k<bs; k++) {
1322:         x = ri[k];
1323:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1324:       }
1325:     }
1326:   }
1327:   VecRestoreArray(ll,&l);
1328:   PetscLogFlops(2.0*a->nz);
1329:   return(0);
1330: }

1334: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1335: {
1336:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1339:   info->block_size   = a->bs2;
1340:   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1341:   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1342:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1343:   info->assemblies   = A->num_ass;
1344:   info->mallocs      = A->info.mallocs;
1345:   info->memory       = ((PetscObject)A)->mem;
1346:   if (A->factortype) {
1347:     info->fill_ratio_given  = A->info.fill_ratio_given;
1348:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1349:     info->factor_mallocs    = A->info.factor_mallocs;
1350:   } else {
1351:     info->fill_ratio_given  = 0;
1352:     info->fill_ratio_needed = 0;
1353:     info->factor_mallocs    = 0;
1354:   }
1355:   return(0);
1356: }


1361: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1362: {
1363:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1367:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1368:   return(0);
1369: }

1373: /*
1374:    This code does not work since it only checks the upper triangular part of
1375:   the matrix. Hence it is not listed in the function table.
1376: */
1377: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1378: {
1379:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1381:   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1382:   PetscReal      atmp;
1383:   MatScalar      *aa;
1384:   PetscScalar    *x;
1385:   PetscInt       ncols,brow,bcol,krow,kcol;

1388:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1389:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1390:   bs  = A->rmap->bs;
1391:   aa  = a->a;
1392:   ai  = a->i;
1393:   aj  = a->j;
1394:   mbs = a->mbs;

1396:   VecSet(v,0.0);
1397:   VecGetArray(v,&x);
1398:   VecGetLocalSize(v,&n);
1399:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1400:   for (i=0; i<mbs; i++) {
1401:     ncols = ai[1] - ai[0]; ai++;
1402:     brow  = bs*i;
1403:     for (j=0; j<ncols; j++) {
1404:       bcol = bs*(*aj);
1405:       for (kcol=0; kcol<bs; kcol++) {
1406:         col = bcol + kcol;      /* col index */
1407:         for (krow=0; krow<bs; krow++) {
1408:           atmp = PetscAbsScalar(*aa); aa++;
1409:           row  = brow + krow;   /* row index */
1410:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1411:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1412:         }
1413:       }
1414:       aj++;
1415:     }
1416:   }
1417:   VecRestoreArray(v,&x);
1418:   return(0);
1419: }