Actual source code: sbaij2.c

petsc-3.3-p7 2013-05-11
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  6: #include <petscblaslapack.h>

 10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
 14:   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
 15:   const PetscInt *idx;
 16:   PetscBT        table_out,table_in;

 19:   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 20:   mbs = a->mbs;
 21:   ai  = a->i;
 22:   aj  = a->j;
 23:   bs  = A->rmap->bs;
 24:   PetscBTCreate(mbs,&table_out);
 25:   PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);
 26:   PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
 27:   PetscBTCreate(mbs,&table_in);

 29:   for (i=0; i<is_max; i++) { /* for each is */
 30:     isz  = 0;
 31:     PetscBTMemzero(mbs,table_out);
 32: 
 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 38:     bcol_max = 0;
 39:     for (j=0; j<n ; ++j){
 40:       brow = idx[j]/bs; /* convert the indices into block indices */
 41:       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if(!PetscBTLookupSet(table_out,brow)) {
 43:         nidx[isz++] = brow;
 44:         if (bcol_max < brow) bcol_max = brow;
 45:       }
 46:     }
 47:     ISRestoreIndices(is[i],&idx);
 48:     ISDestroy(&is[i]);
 49: 
 50:     k = 0;
 51:     for (j=0; j<ov; j++){ /* for each overlap */
 52:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 53:       PetscBTMemzero(mbs,table_in);
 54:       for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }

 56:       n = isz;  /* length of the updated is[i] */
 57:       for (brow=0; brow<mbs; brow++){
 58:         start = ai[brow]; end   = ai[brow+1];
 59:         if (PetscBTLookup(table_in,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
 60:           for (l = start; l<end ; l++){
 61:             bcol = aj[l];
 62:             if (!PetscBTLookupSet(table_out,bcol)) {
 63:               nidx[isz++] = bcol;
 64:               if (bcol_max < bcol) bcol_max = bcol;
 65:             }
 66:           }
 67:           k++;
 68:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 69:         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 70:           for (l = start; l<end ; l++){
 71:             bcol = aj[l];
 72:             if (bcol > bcol_max) break;
 73:             if (PetscBTLookup(table_in,bcol)){
 74:               if (!PetscBTLookupSet(table_out,brow)) {nidx[isz++] = brow;}
 75:               break; /* for l = start; l<end ; l++) */
 76:             }
 77:           }
 78:         }
 79:       }
 80:     } /* for each overlap */

 82:     /* expand the Index Set */
 83:     for (j=0; j<isz; j++) {
 84:       for (k=0; k<bs; k++)
 85:         nidx2[j*bs+k] = nidx[j]*bs+k;
 86:     }
 87:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 88:   }
 89:   PetscBTDestroy(&table_out);
 90:   PetscFree(nidx);
 91:   PetscFree(nidx2);
 92:   PetscBTDestroy(&table_in);
 93:   return(0);
 94: }

 98: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 99: {
100:   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
102:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
103:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
104:   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
105:   const PetscInt *irow,*aj = a->j,*ai = a->i;
106:   MatScalar      *mat_a;
107:   Mat            C;
108:   PetscBool      flag,sorted;

111:   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
112:   ISSorted(iscol,&sorted);
113:   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

115:   ISGetIndices(isrow,&irow);
116:   ISGetSize(isrow,&nrows);
117: 
118:   PetscMalloc(oldcols*sizeof(PetscInt),&smap);
119:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
120:   ssmap = smap;
121:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
122:   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
123:   /* determine lens of each row */
124:   for (i=0; i<nrows; i++) {
125:     kstart  = ai[irow[i]];
126:     kend    = kstart + a->ilen[irow[i]];
127:     lens[i] = 0;
128:       for (k=kstart; k<kend; k++) {
129:         if (ssmap[aj[k]]) {
130:           lens[i]++;
131:         }
132:       }
133:     }
134:   /* Create and fill new matrix */
135:   if (scall == MAT_REUSE_MATRIX) {
136:     c = (Mat_SeqSBAIJ *)((*B)->data);

138:     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
139:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
140:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
141:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
142:     C = *B;
143:   } else {
144:     MatCreate(((PetscObject)A)->comm,&C);
145:     MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);
146:     MatSetType(C,((PetscObject)A)->type_name);
147:     MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
148:   }
149:   c = (Mat_SeqSBAIJ *)(C->data);
150:   for (i=0; i<nrows; i++) {
151:     row    = irow[i];
152:     kstart = ai[row];
153:     kend   = kstart + a->ilen[row];
154:     mat_i  = c->i[i];
155:     mat_j  = c->j + mat_i;
156:     mat_a  = c->a + mat_i*bs2;
157:     mat_ilen = c->ilen + i;
158:     for (k=kstart; k<kend; k++) {
159:       if ((tcol=ssmap[a->j[k]])) {
160:         *mat_j++ = tcol - 1;
161:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
162:         mat_a   += bs2;
163:         (*mat_ilen)++;
164:       }
165:     }
166:   }
167: 
168:   /* Free work space */
169:   PetscFree(smap);
170:   PetscFree(lens);
171:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
172:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
173: 
174:   ISRestoreIndices(isrow,&irow);
175:   *B = C;
176:   return(0);
177: }

181: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
182: {
183:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184:   IS             is1;
186:   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187:   const PetscInt *irow;

190:   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191: 
192:   ISGetIndices(isrow,&irow);
193:   ISGetSize(isrow,&nrows);
194: 
195:   /* Verify if the indices corespond to each element in a block 
196:    and form the IS with compressed IS */
197:   PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
198:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
199:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200: 
201:   count = 0;
202:   for (i=0; i<a->mbs; i++) {
203:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204:     if (vary[i]==bs) iary[count++] = i;
205:   }
206:   ISRestoreIndices(isrow,&irow);
207:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
208:   PetscFree2(vary,iary);

210:   MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);
211:   ISDestroy(&is1);
212:   return(0);
213: }

217: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218: {
220:   PetscInt       i;

223:   if (scall == MAT_INITIAL_MATRIX) {
224:     PetscMalloc((n+1)*sizeof(Mat),B);
225:   }

227:   for (i=0; i<n; i++) {
228:     MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
229:   }
230:   return(0);
231: }

233: /* -------------------------------------------------------*/
234: /* Should check that shapes of vectors and matrices match */
235: /* -------------------------------------------------------*/

239: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
240: {
241:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
242:   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
243:   MatScalar      *v;
245:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
246:   PetscInt       nonzerorow=0;

249:   VecSet(zz,zero);
250:   VecGetArray(xx,&x);
251:   VecGetArray(zz,&z);
252: 
253:   v     = a->a;
254:   xb = x;

256:   for (i=0; i<mbs; i++) {
257:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
258:     x1 = xb[0]; x2 = xb[1];
259:     ib = aj + *ai;
260:     jmin = 0;
261:     nonzerorow += (n>0);
262:     if (*ib == i){     /* (diag of A)*x */
263:       z[2*i]   += v[0]*x1 + v[2]*x2;
264:       z[2*i+1] += v[2]*x1 + v[3]*x2;
265:       v += 4; jmin++;
266:     }
267:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
268:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
269:     for (j=jmin; j<n; j++) {
270:       /* (strict lower triangular part of A)*x  */
271:       cval       = ib[j]*2;
272:       z[cval]     += v[0]*x1 + v[1]*x2;
273:       z[cval+1]   += v[2]*x1 + v[3]*x2;
274:       /* (strict upper triangular part of A)*x  */
275:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
276:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
277:       v  += 4;
278:     }
279:     xb +=2; ai++;
280:   }

282:   VecRestoreArray(xx,&x);
283:   VecRestoreArray(zz,&z);
284:   PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
285:   return(0);
286: }

290: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
291: {
292:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
293:   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
294:   MatScalar      *v;
296:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
297:   PetscInt       nonzerorow=0;

300:   VecSet(zz,zero);
301:   VecGetArray(xx,&x);
302:   VecGetArray(zz,&z);
303: 
304:   v    = a->a;
305:   xb   = x;

307:   for (i=0; i<mbs; i++) {
308:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
309:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
310:     ib = aj + *ai;
311:     jmin = 0;
312:     nonzerorow += (n>0);
313:     if (*ib == i){     /* (diag of A)*x */
314:       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
315:       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
316:       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
317:       v += 9; jmin++;
318:     }
319:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321:     for (j=jmin; j<n; j++) {
322:       /* (strict lower triangular part of A)*x  */
323:       cval       = ib[j]*3;
324:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
325:       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
326:       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
327:       /* (strict upper triangular part of A)*x  */
328:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
329:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
330:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
331:       v  += 9;
332:     }
333:     xb +=3; ai++;
334:   }

336:   VecRestoreArray(xx,&x);
337:   VecRestoreArray(zz,&z);
338:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
339:   return(0);
340: }

344: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
345: {
346:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
347:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
348:   MatScalar      *v;
350:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
351:   PetscInt       nonzerorow=0;

354:   VecSet(zz,zero);
355:   VecGetArray(xx,&x);
356:   VecGetArray(zz,&z);
357: 
358:   v     = a->a;
359:   xb = x;

361:   for (i=0; i<mbs; i++) {
362:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
363:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
364:     ib = aj + *ai;
365:     jmin = 0;
366:     nonzerorow += (n>0);
367:     if (*ib == i){     /* (diag of A)*x */
368:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
369:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
370:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
371:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
372:       v += 16; jmin++;
373:     }
374:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
375:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
376:     for (j=jmin; j<n; j++) {
377:       /* (strict lower triangular part of A)*x  */
378:       cval       = ib[j]*4;
379:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
380:       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
381:       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
382:       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
383:       /* (strict upper triangular part of A)*x  */
384:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
385:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
386:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
387:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
388:       v  += 16;
389:     }
390:     xb +=4; ai++;
391:   }

393:   VecRestoreArray(xx,&x);
394:   VecRestoreArray(zz,&z);
395:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
396:   return(0);
397: }

401: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
402: {
403:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
404:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
405:   MatScalar      *v;
407:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
408:   PetscInt       nonzerorow=0;

411:   VecSet(zz,zero);
412:   VecGetArray(xx,&x);
413:   VecGetArray(zz,&z);
414: 
415:   v     = a->a;
416:   xb = x;

418:   for (i=0; i<mbs; i++) {
419:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
420:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
421:     ib = aj + *ai;
422:     jmin = 0;
423:     nonzerorow += (n>0);
424:     if (*ib == i){      /* (diag of A)*x */
425:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
426:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
427:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
428:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
429:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
430:       v += 25; jmin++;
431:     }
432:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
433:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
434:     for (j=jmin; j<n; j++) {
435:       /* (strict lower triangular part of A)*x  */
436:       cval       = ib[j]*5;
437:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
438:       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
439:       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
440:       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
441:       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
442:       /* (strict upper triangular part of A)*x  */
443:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
444:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
445:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
446:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
447:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
448:       v  += 25;
449:     }
450:     xb +=5; ai++;
451:   }

453:   VecRestoreArray(xx,&x);
454:   VecRestoreArray(zz,&z);
455:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
456:   return(0);
457: }


462: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
463: {
464:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
465:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
466:   MatScalar      *v;
468:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
469:   PetscInt       nonzerorow=0;

472:   VecSet(zz,zero);
473:   VecGetArray(xx,&x);
474:   VecGetArray(zz,&z);
475: 
476:   v     = a->a;
477:   xb = x;

479:   for (i=0; i<mbs; i++) {
480:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
481:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
482:     ib = aj + *ai;
483:     jmin = 0;
484:     nonzerorow += (n>0);
485:     if (*ib == i){      /* (diag of A)*x */
486:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
487:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
488:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
489:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
490:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
491:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
492:       v += 36; jmin++;
493:     }
494:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
495:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
496:     for (j=jmin; j<n; j++) {
497:       /* (strict lower triangular part of A)*x  */
498:       cval       = ib[j]*6;
499:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
500:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
501:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
502:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
503:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
504:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
505:       /* (strict upper triangular part of A)*x  */
506:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
507:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
508:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
509:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
510:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
511:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
512:       v  += 36;
513:     }
514:     xb +=6; ai++;
515:   }

517:   VecRestoreArray(xx,&x);
518:   VecRestoreArray(zz,&z);
519:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
520:   return(0);
521: }
524: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
525: {
526:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
527:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
528:   MatScalar      *v;
530:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
531:   PetscInt       nonzerorow=0;

534:   VecSet(zz,zero);
535:   VecGetArray(xx,&x);
536:   VecGetArray(zz,&z);
537: 
538:   v     = a->a;
539:   xb = x;

541:   for (i=0; i<mbs; i++) {
542:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
543:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
544:     ib = aj + *ai;
545:     jmin = 0;
546:     nonzerorow += (n>0);
547:     if (*ib == i){      /* (diag of A)*x */
548:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
549:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
550:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
551:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
552:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
553:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
554:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
555:       v += 49; jmin++;
556:     }
557:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
558:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
559:     for (j=jmin; j<n; j++) {
560:       /* (strict lower triangular part of A)*x  */
561:       cval       = ib[j]*7;
562:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
563:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
564:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
565:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
566:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
567:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
568:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569:       /* (strict upper triangular part of A)*x  */
570:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
571:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
572:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
573:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
574:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
575:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
576:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
577:       v  += 49;
578:     }
579:     xb +=7; ai++;
580:   }
581:   VecRestoreArray(xx,&x);
582:   VecRestoreArray(zz,&z);
583:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
584:   return(0);
585: }

587: /*
588:     This will not work with MatScalar == float because it calls the BLAS
589: */
592: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
593: {
594:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
595:   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
596:   MatScalar      *v;
598:   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
599:   PetscInt       nonzerorow=0;

602:   VecSet(zz,zero);
603:   VecGetArray(xx,&x); x_ptr=x;
604:   VecGetArray(zz,&z); z_ptr=z;

606:   aj   = a->j;
607:   v    = a->a;
608:   ii   = a->i;

610:   if (!a->mult_work) {
611:     PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);
612:   }
613:   work = a->mult_work;
614: 
615:   for (i=0; i<mbs; i++) {
616:     n     = ii[1] - ii[0]; ncols = n*bs;
617:     workt = work; idx=aj+ii[0];
618:     nonzerorow += (n>0);

620:     /* upper triangular part */
621:     for (j=0; j<n; j++) {
622:       xb = x_ptr + bs*(*idx++);
623:       for (k=0; k<bs; k++) workt[k] = xb[k];
624:       workt += bs;
625:     }
626:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
627:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
628: 
629:     /* strict lower triangular part */
630:     idx = aj+ii[0];
631:     if (*idx == i){
632:       ncols -= bs; v += bs2; idx++; n--;
633:     }
634: 
635:     if (ncols > 0){
636:       workt = work;
637:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
638:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
639:       for (j=0; j<n; j++) {
640:         zb = z_ptr + bs*(*idx++);
641:         for (k=0; k<bs; k++) zb[k] += workt[k] ;
642:         workt += bs;
643:       }
644:     }
645:     x += bs; v += n*bs2; z += bs; ii++;
646:   }
647: 
648:   VecRestoreArray(xx,&x);
649:   VecRestoreArray(zz,&z);
650:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
651:   return(0);
652: }

656: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
657: {
658:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
659:   PetscScalar    *x,*z,*xb,x1;
660:   MatScalar      *v;
662:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
663:   PetscInt       nonzerorow=0;

666:   VecCopy(yy,zz);
667:   VecGetArray(xx,&x);
668:   VecGetArray(zz,&z);
669:   v  = a->a;
670:   xb = x;

672:   for (i=0; i<mbs; i++) {
673:     n  = ai[1] - ai[0];  /* length of i_th row of A */
674:     x1 = xb[0];
675:     ib = aj + *ai;
676:     jmin = 0;
677:     nonzerorow += (n>0);
678:     if (*ib == i) {            /* (diag of A)*x */
679:       z[i] += *v++ * x[*ib++]; jmin++;
680:     }
681:     for (j=jmin; j<n; j++) {
682:       cval    = *ib;
683:       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
684:       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
685:     }
686:     xb++; ai++;
687:   }

689:   VecRestoreArray(xx,&x);
690:   VecRestoreArray(zz,&z);
691: 
692:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
693:   return(0);
694: }

698: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
699: {
700:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
701:   PetscScalar    *x,*z,*xb,x1,x2;
702:   MatScalar      *v;
704:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
705:   PetscInt       nonzerorow=0;

708:   VecCopy(yy,zz);
709:   VecGetArray(xx,&x);
710:   VecGetArray(zz,&z);

712:   v  = a->a;
713:   xb = x;

715:   for (i=0; i<mbs; i++) {
716:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
717:     x1 = xb[0]; x2 = xb[1];
718:     ib = aj + *ai;
719:     jmin = 0;
720:     nonzerorow += (n>0);
721:     if (*ib == i){      /* (diag of A)*x */
722:       z[2*i]   += v[0]*x1 + v[2]*x2;
723:       z[2*i+1] += v[2]*x1 + v[3]*x2;
724:       v += 4; jmin++;
725:     }
726:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
727:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
728:     for (j=jmin; j<n; j++) {
729:       /* (strict lower triangular part of A)*x  */
730:       cval       = ib[j]*2;
731:       z[cval]     += v[0]*x1 + v[1]*x2;
732:       z[cval+1]   += v[2]*x1 + v[3]*x2;
733:       /* (strict upper triangular part of A)*x  */
734:       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
735:       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
736:       v  += 4;
737:     }
738:     xb +=2; ai++;
739:   }
740:   VecRestoreArray(xx,&x);
741:   VecRestoreArray(zz,&z);

743:   PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
744:   return(0);
745: }

749: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
750: {
751:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
752:   PetscScalar    *x,*z,*xb,x1,x2,x3;
753:   MatScalar      *v;
755:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
756:   PetscInt       nonzerorow=0;

759:   VecCopy(yy,zz);
760:   VecGetArray(xx,&x);
761:   VecGetArray(zz,&z);

763:   v     = a->a;
764:   xb = x;

766:   for (i=0; i<mbs; i++) {
767:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
768:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
769:     ib = aj + *ai;
770:     jmin = 0;
771:     nonzerorow += (n>0);
772:     if (*ib == i){     /* (diag of A)*x */
773:      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
774:      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
775:      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
776:      v += 9; jmin++;
777:     }
778:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
779:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
780:     for (j=jmin; j<n; j++) {
781:       /* (strict lower triangular part of A)*x  */
782:       cval       = ib[j]*3;
783:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
784:       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
785:       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
786:       /* (strict upper triangular part of A)*x  */
787:       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
788:       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
789:       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
790:       v  += 9;
791:     }
792:     xb +=3; ai++;
793:   }

795:   VecRestoreArray(xx,&x);
796:   VecRestoreArray(zz,&z);

798:   PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
799:   return(0);
800: }

804: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
805: {
806:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
807:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
808:   MatScalar      *v;
810:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
811:   PetscInt       nonzerorow=0;

814:   VecCopy(yy,zz);
815:   VecGetArray(xx,&x);
816:   VecGetArray(zz,&z);

818:   v     = a->a;
819:   xb = x;

821:   for (i=0; i<mbs; i++) {
822:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
823:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
824:     ib = aj + *ai;
825:     jmin = 0;
826:     nonzerorow += (n>0);
827:     if (*ib == i){      /* (diag of A)*x */
828:       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
829:       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
830:       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
831:       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
832:       v += 16; jmin++;
833:     }
834:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
835:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
836:     for (j=jmin; j<n; j++) {
837:       /* (strict lower triangular part of A)*x  */
838:       cval       = ib[j]*4;
839:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
840:       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
841:       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
842:       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
843:       /* (strict upper triangular part of A)*x  */
844:       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
845:       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
846:       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
847:       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
848:       v  += 16;
849:     }
850:     xb +=4; ai++;
851:   }

853:   VecRestoreArray(xx,&x);
854:   VecRestoreArray(zz,&z);

856:   PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
857:   return(0);
858: }

862: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
863: {
864:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
865:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
866:   MatScalar      *v;
868:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
869:   PetscInt       nonzerorow=0;

872:   VecCopy(yy,zz);
873:   VecGetArray(xx,&x);
874:   VecGetArray(zz,&z);

876:   v     = a->a;
877:   xb = x;

879:   for (i=0; i<mbs; i++) {
880:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
881:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
882:     ib = aj + *ai;
883:     jmin = 0;
884:     nonzerorow += (n>0);
885:     if (*ib == i){      /* (diag of A)*x */
886:       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
887:       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
888:       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
889:       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
890:       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
891:       v += 25; jmin++;
892:     }
893:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
894:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
895:     for (j=jmin; j<n; j++) {
896:       /* (strict lower triangular part of A)*x  */
897:       cval       = ib[j]*5;
898:       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
899:       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
900:       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
901:       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
902:       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
903:       /* (strict upper triangular part of A)*x  */
904:       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
905:       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
906:       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
907:       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
908:       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
909:       v  += 25;
910:     }
911:     xb +=5; ai++;
912:   }

914:   VecRestoreArray(xx,&x);
915:   VecRestoreArray(zz,&z);

917:   PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
918:   return(0);
919: }
922: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
923: {
924:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
925:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
926:   MatScalar      *v;
928:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
929:   PetscInt       nonzerorow=0;

932:   VecCopy(yy,zz);
933:   VecGetArray(xx,&x);
934:   VecGetArray(zz,&z);

936:   v     = a->a;
937:   xb = x;

939:   for (i=0; i<mbs; i++) {
940:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
941:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
942:     ib = aj + *ai;
943:     jmin = 0;
944:     nonzerorow += (n>0);
945:     if (*ib == i){     /* (diag of A)*x */
946:       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
947:       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
948:       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
949:       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
950:       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
951:       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
952:       v += 36; jmin++;
953:     }
954:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
955:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956:     for (j=jmin; j<n; j++) {
957:       /* (strict lower triangular part of A)*x  */
958:       cval       = ib[j]*6;
959:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
960:       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
961:       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
962:       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
963:       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
964:       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
965:       /* (strict upper triangular part of A)*x  */
966:       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
967:       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
968:       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
969:       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
970:       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
971:       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
972:       v  += 36;
973:     }
974:     xb +=6; ai++;
975:   }

977:   VecRestoreArray(xx,&x);
978:   VecRestoreArray(zz,&z);

980:   PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
981:   return(0);
982: }

986: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
987: {
988:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
989:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
990:   MatScalar      *v;
992:   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
993:   PetscInt       nonzerorow=0;

996:   VecCopy(yy,zz);
997:   VecGetArray(xx,&x);
998:   VecGetArray(zz,&z);

1000:   v     = a->a;
1001:   xb = x;

1003:   for (i=0; i<mbs; i++) {
1004:     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1005:     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1006:     ib = aj + *ai;
1007:     jmin = 0;
1008:     nonzerorow += (n>0);
1009:     if (*ib == i){     /* (diag of A)*x */
1010:       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1011:       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1012:       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1013:       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1014:       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1015:       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1016:       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1017:       v += 49; jmin++;
1018:     }
1019:     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1020:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021:     for (j=jmin; j<n; j++) {
1022:       /* (strict lower triangular part of A)*x  */
1023:       cval       = ib[j]*7;
1024:       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1025:       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1026:       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1027:       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1028:       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1029:       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1030:       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1031:       /* (strict upper triangular part of A)*x  */
1032:       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1033:       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1034:       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1035:       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1036:       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1037:       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1038:       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1039:       v  += 49;
1040:     }
1041:     xb +=7; ai++;
1042:   }

1044:   VecRestoreArray(xx,&x);
1045:   VecRestoreArray(zz,&z);

1047:   PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1048:   return(0);
1049: }

1053: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1054: {
1055:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1056:   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1057:   MatScalar      *v;
1059:   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1060:   PetscInt       nonzerorow=0;

1063:   VecCopy(yy,zz);
1064:   VecGetArray(xx,&x); x_ptr=x;
1065:   VecGetArray(zz,&z); z_ptr=z;

1067:   aj   = a->j;
1068:   v    = a->a;
1069:   ii   = a->i;

1071:   if (!a->mult_work) {
1072:     PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);
1073:   }
1074:   work = a->mult_work;
1075: 
1076: 
1077:   for (i=0; i<mbs; i++) {
1078:     n     = ii[1] - ii[0]; ncols = n*bs;
1079:     workt = work; idx=aj+ii[0];
1080:     nonzerorow += (n>0);

1082:     /* upper triangular part */
1083:     for (j=0; j<n; j++) {
1084:       xb = x_ptr + bs*(*idx++);
1085:       for (k=0; k<bs; k++) workt[k] = xb[k];
1086:       workt += bs;
1087:     }
1088:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1089:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);

1091:     /* strict lower triangular part */
1092:     idx = aj+ii[0];
1093:     if (*idx == i){
1094:       ncols -= bs; v += bs2; idx++; n--;
1095:     }
1096:     if (ncols > 0){
1097:       workt = work;
1098:       PetscMemzero(workt,ncols*sizeof(PetscScalar));
1099:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1100:       for (j=0; j<n; j++) {
1101:         zb = z_ptr + bs*(*idx++);
1102:         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1103:         workt += bs;
1104:       }
1105:     }

1107:     x += bs; v += n*bs2; z += bs; ii++;
1108:   }

1110:   VecRestoreArray(xx,&x);
1111:   VecRestoreArray(zz,&z);

1113:   PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1114:   return(0);
1115: }

1119: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1120: {
1121:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1122:   PetscScalar    oalpha = alpha;
1124:   PetscBLASInt   one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);

1127:   BLASscal_(&totalnz,&oalpha,a->a,&one);
1128:   PetscLogFlops(totalnz);
1129:   return(0);
1130: }

1134: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1135: {
1136:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1137:   MatScalar      *v = a->a;
1138:   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1139:   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1141:   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1142: 
1144:   if (type == NORM_FROBENIUS) {
1145:     for (k=0; k<mbs; k++){
1146:       jmin = a->i[k]; jmax = a->i[k+1];
1147:       col  = aj + jmin;
1148:       if (*col == k){         /* diagonal block */
1149:         for (i=0; i<bs2; i++){
1150: #if defined(PETSC_USE_COMPLEX)
1151:           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1152: #else
1153:           sum_diag += (*v)*(*v); v++;
1154: #endif
1155:         }
1156:         jmin++;
1157:       }
1158:       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1159:         for (i=0; i<bs2; i++){
1160: #if defined(PETSC_USE_COMPLEX)
1161:           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1162: #else
1163:           sum_off += (*v)*(*v); v++;
1164: #endif  
1165:         }
1166:       }
1167:     }
1168:     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1169:   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1170:     PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);
1171:     for (i=0; i<mbs; i++) jl[i] = mbs;
1172:     il[0] = 0;

1174:     *norm = 0.0;
1175:     for (k=0; k<mbs; k++) { /* k_th block row */
1176:       for (j=0; j<bs; j++) sum[j]=0.0;
1177:       /*-- col sum --*/
1178:       i = jl[k]; /* first |A(i,k)| to be added */
1179:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1180:                   at step k */
1181:       while (i<mbs){
1182:         nexti = jl[i];  /* next block row to be added */
1183:         ik    = il[i];  /* block index of A(i,k) in the array a */
1184:         for (j=0; j<bs; j++){
1185:           v = a->a + ik*bs2 + j*bs;
1186:           for (k1=0; k1<bs; k1++) {
1187:             sum[j] += PetscAbsScalar(*v); v++;
1188:           }
1189:         }
1190:         /* update il, jl */
1191:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1192:         jmax = a->i[i+1];
1193:         if (jmin < jmax){
1194:           il[i] = jmin;
1195:           j   = a->j[jmin];
1196:           jl[i] = jl[j]; jl[j]=i;
1197:         }
1198:         i = nexti;
1199:       }
1200:       /*-- row sum --*/
1201:       jmin = a->i[k]; jmax = a->i[k+1];
1202:       for (i=jmin; i<jmax; i++) {
1203:         for (j=0; j<bs; j++){
1204:           v = a->a + i*bs2 + j;
1205:           for (k1=0; k1<bs; k1++){
1206:             sum[j] += PetscAbsScalar(*v); v += bs;
1207:           }
1208:         }
1209:       }
1210:       /* add k_th block row to il, jl */
1211:       col = aj+jmin;
1212:       if (*col == k) jmin++;
1213:       if (jmin < jmax){
1214:         il[k] = jmin;
1215:         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1216:       }
1217:       for (j=0; j<bs; j++){
1218:         if (sum[j] > *norm) *norm = sum[j];
1219:       }
1220:     }
1221:     PetscFree3(sum,il,jl);
1222:   } else {
1223:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1224:   }
1225:   return(0);
1226: }

1230: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1231: {
1232:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;


1237:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1238:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1239:     *flg = PETSC_FALSE;
1240:     return(0);
1241:   }
1242: 
1243:   /* if the a->i are the same */
1244:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1245:   if (!*flg) {
1246:     return(0);
1247:   }
1248: 
1249:   /* if a->j are the same */
1250:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1251:   if (!*flg) {
1252:     return(0);
1253:   }
1254:   /* if a->a are the same */
1255:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1256:   return(0);
1257: }

1261: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1262: {
1263:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1265:   PetscInt       i,j,k,row,bs,*ai,*aj,ambs,bs2;
1266:   PetscScalar    *x,zero = 0.0;
1267:   MatScalar      *aa,*aa_j;

1270:   bs   = A->rmap->bs;
1271:   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1272: 
1273:   aa   = a->a;
1274:   ambs = a->mbs;

1276:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC){
1277:     PetscInt *diag=a->diag;
1278:     aa   = a->a;
1279:     ambs = a->mbs;
1280:     VecGetArray(v,&x);
1281:     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1282:     VecRestoreArray(v,&x);
1283:     return(0);
1284:   }
1285: 
1286:   ai   = a->i;
1287:   aj   = a->j;
1288:   bs2  = a->bs2;
1289:   VecSet(v,zero);
1290:   VecGetArray(v,&x);
1291:   for (i=0; i<ambs; i++) {
1292:     j=ai[i];
1293:     if (aj[j] == i) {    /* if this is a diagonal element */
1294:       row  = i*bs;
1295:       aa_j = aa + j*bs2;
1296:       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1297:     }
1298:   }
1299:   VecRestoreArray(v,&x);
1300:   return(0);
1301: }

1305: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1306: {
1307:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1308:   PetscScalar    *l,x,*li,*ri;
1309:   MatScalar      *aa,*v;
1311:   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1312:   PetscBool      flg;

1315:   if (ll != rr){
1316:     VecEqual(ll,rr,&flg);
1317:     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1318:   }
1319:   if (!ll) return(0);
1320:   ai  = a->i;
1321:   aj  = a->j;
1322:   aa  = a->a;
1323:   m   = A->rmap->N;
1324:   bs  = A->rmap->bs;
1325:   mbs = a->mbs;
1326:   bs2 = a->bs2;

1328:   VecGetArray(ll,&l);
1329:   VecGetLocalSize(ll,&lm);
1330:   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1331:   for (i=0; i<mbs; i++) { /* for each block row */
1332:     M  = ai[i+1] - ai[i];
1333:     li = l + i*bs;
1334:     v  = aa + bs2*ai[i];
1335:     for (j=0; j<M; j++) { /* for each block */
1336:       ri = l + bs*aj[ai[i]+j];
1337:       for (k=0; k<bs; k++) {
1338:         x = ri[k];
1339:         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1340:       }
1341:     }
1342:   }
1343:   VecRestoreArray(ll,&l);
1344:   PetscLogFlops(2.0*a->nz);
1345:   return(0);
1346: }

1350: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1351: {
1352:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1355:   info->block_size     = a->bs2;
1356:   info->nz_allocated   = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1357:   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1358:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1359:   info->assemblies   = A->num_ass;
1360:   info->mallocs      = A->info.mallocs;
1361:   info->memory       = ((PetscObject)A)->mem;
1362:   if (A->factortype) {
1363:     info->fill_ratio_given  = A->info.fill_ratio_given;
1364:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1365:     info->factor_mallocs    = A->info.factor_mallocs;
1366:   } else {
1367:     info->fill_ratio_given  = 0;
1368:     info->fill_ratio_needed = 0;
1369:     info->factor_mallocs    = 0;
1370:   }
1371:   return(0);
1372: }


1377: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1378: {
1379:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;

1383:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1384:   return(0);
1385: }

1389: /* 
1390:    This code does not work since it only checks the upper triangular part of
1391:   the matrix. Hence it is not listed in the function table.
1392: */
1393: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1394: {
1395:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1397:   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1398:   PetscReal      atmp;
1399:   MatScalar      *aa;
1400:   PetscScalar    *x;
1401:   PetscInt       ncols,brow,bcol,krow,kcol;

1404:   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1405:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1406:   bs   = A->rmap->bs;
1407:   aa   = a->a;
1408:   ai   = a->i;
1409:   aj   = a->j;
1410:   mbs = a->mbs;

1412:   VecSet(v,0.0);
1413:   VecGetArray(v,&x);
1414:   VecGetLocalSize(v,&n);
1415:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1416:   for (i=0; i<mbs; i++) {
1417:     ncols = ai[1] - ai[0]; ai++;
1418:     brow  = bs*i;
1419:     for (j=0; j<ncols; j++){
1420:       bcol = bs*(*aj);
1421:       for (kcol=0; kcol<bs; kcol++){
1422:         col = bcol + kcol;      /* col index */
1423:         for (krow=0; krow<bs; krow++){
1424:           atmp = PetscAbsScalar(*aa); aa++;
1425:           row = brow + krow;    /* row index */
1426:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1427:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1428:         }
1429:       }
1430:       aj++;
1431:     }
1432:   }
1433:   VecRestoreArray(v,&x);
1434:   return(0);
1435: }