Actual source code: baij2.c

 2:  #include src/mat/impls/baij/seq/baij.h
 3:  #include src/inline/spops.h
 4:  #include src/inline/ilu.h
 5:  #include petscbt.h

  9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 10: {
 11:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 13:   PetscInt       row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
 14:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 15:   PetscBT        table;

 18:   m     = a->mbs;
 19:   ai    = a->i;
 20:   aj    = a->j;
 21:   bs    = A->bs;

 23:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 25:   PetscBTCreate(m,table);
 26:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 27:   PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);

 29:   for (i=0; i<is_max; i++) {
 30:     /* Initialise the two local arrays */
 31:     isz  = 0;
 32:     PetscBTMemzero(m,table);
 33: 
 34:     /* Extract the indices, assume there can be duplicate entries */
 35:     ISGetIndices(is[i],&idx);
 36:     ISGetLocalSize(is[i],&n);

 38:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 39:     for (j=0; j<n ; ++j){
 40:       ival = idx[j]/bs; /* convert the indices into block indices */
 41:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 43:     }
 44:     ISRestoreIndices(is[i],&idx);
 45:     ISDestroy(is[i]);
 46: 
 47:     k = 0;
 48:     for (j=0; j<ov; j++){ /* for each overlap*/
 49:       n = isz;
 50:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 51:         row   = nidx[k];
 52:         start = ai[row];
 53:         end   = ai[row+1];
 54:         for (l = start; l<end ; l++){
 55:           val = aj[l];
 56:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 57:         }
 58:       }
 59:     }
 60:     /* expand the Index Set */
 61:     for (j=0; j<isz; j++) {
 62:       for (k=0; k<bs; k++)
 63:         nidx2[j*bs+k] = nidx[j]*bs+k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 66:   }
 67:   PetscBTDestroy(table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return(0);
 71: }

 75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 76: {
 77:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 79:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 80:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 81:   PetscInt       *irow,*icol,nrows,ncols,*ssmap,bs=A->bs,bs2=a->bs2;
 82:   PetscInt       *aj = a->j,*ai = a->i;
 83:   MatScalar      *mat_a;
 84:   Mat            C;
 85:   PetscTruth     flag;

 88:   ISSorted(iscol,(PetscTruth*)&i);
 89:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 91:   ISGetIndices(isrow,&irow);
 92:   ISGetIndices(iscol,&icol);
 93:   ISGetLocalSize(isrow,&nrows);
 94:   ISGetLocalSize(iscol,&ncols);

 96:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 97:   ssmap = smap;
 98:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
 99:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
100:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
101:   /* determine lens of each row */
102:   for (i=0; i<nrows; i++) {
103:     kstart  = ai[irow[i]];
104:     kend    = kstart + a->ilen[irow[i]];
105:     lens[i] = 0;
106:       for (k=kstart; k<kend; k++) {
107:         if (ssmap[aj[k]]) {
108:           lens[i]++;
109:         }
110:       }
111:     }
112:   /* Create and fill new matrix */
113:   if (scall == MAT_REUSE_MATRIX) {
114:     c = (Mat_SeqBAIJ *)((*B)->data);

116:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
117:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
118:     if (flag == PETSC_FALSE) {
119:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120:     }
121:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
122:     C = *B;
123:   } else {
124:     MatCreate(A->comm,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);
125:     MatSetType(C,A->type_name);
126:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
127:   }
128:   c = (Mat_SeqBAIJ *)(C->data);
129:   for (i=0; i<nrows; i++) {
130:     row    = irow[i];
131:     kstart = ai[row];
132:     kend   = kstart + a->ilen[row];
133:     mat_i  = c->i[i];
134:     mat_j  = c->j + mat_i;
135:     mat_a  = c->a + mat_i*bs2;
136:     mat_ilen = c->ilen + i;
137:     for (k=kstart; k<kend; k++) {
138:       if ((tcol=ssmap[a->j[k]])) {
139:         *mat_j++ = tcol - 1;
140:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
141:         mat_a   += bs2;
142:         (*mat_ilen)++;
143:       }
144:     }
145:   }
146: 
147:   /* Free work space */
148:   ISRestoreIndices(iscol,&icol);
149:   PetscFree(smap);
150:   PetscFree(lens);
151:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
152:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
153: 
154:   ISRestoreIndices(isrow,&irow);
155:   *B = C;
156:   return(0);
157: }

161: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
162: {
163:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
164:   IS             is1,is2;
166:   PetscInt       *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->bs,count;

169:   ISGetIndices(isrow,&irow);
170:   ISGetIndices(iscol,&icol);
171:   ISGetLocalSize(isrow,&nrows);
172:   ISGetLocalSize(iscol,&ncols);
173: 
174:   /* Verify if the indices corespond to each element in a block 
175:    and form the IS with compressed IS */
176:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
177:   iary = vary + a->mbs;
178:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
179:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180:   count = 0;
181:   for (i=0; i<a->mbs; i++) {
182:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183:     if (vary[i]==bs) iary[count++] = i;
184:   }
185:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
186: 
187:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
188:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189:   count = 0;
190:   for (i=0; i<a->mbs; i++) {
191:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
192:     if (vary[i]==bs) iary[count++] = i;
193:   }
194:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
195:   ISRestoreIndices(isrow,&irow);
196:   ISRestoreIndices(iscol,&icol);
197:   PetscFree(vary);

199:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
200:   ISDestroy(is1);
201:   ISDestroy(is2);
202:   return(0);
203: }

207: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208: {
210:   PetscInt       i;

213:   if (scall == MAT_INITIAL_MATRIX) {
214:     PetscMalloc((n+1)*sizeof(Mat),B);
215:   }

217:   for (i=0; i<n; i++) {
218:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
219:   }
220:   return(0);
221: }


224: /* -------------------------------------------------------*/
225: /* Should check that shapes of vectors and matrices match */
226: /* -------------------------------------------------------*/
227:  #include petscblaslapack.h

231: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
232: {
233:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
234:   PetscScalar    *x,*z,sum;
235:   MatScalar      *v;
237:   PetscInt       mbs=a->mbs,i,*idx,*ii,n;

240:   VecGetArray(xx,&x);
241:   VecGetArray(zz,&z);

243:   idx   = a->j;
244:   v     = a->a;
245:   ii    = a->i;

247:   for (i=0; i<mbs; i++) {
248:     n    = ii[1] - ii[0]; ii++;
249:     sum  = 0.0;
250:     while (n--) sum += *v++ * x[*idx++];
251:     z[i] = sum;
252:   }
253:   VecRestoreArray(xx,&x);
254:   VecRestoreArray(zz,&z);
255:   PetscLogFlops(2*a->nz - A->m);
256:   return(0);
257: }

261: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
262: {
263:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
264:   PetscScalar    *x,*z,*xb,sum1,sum2;
265:   PetscScalar    x1,x2;
266:   MatScalar      *v;
268:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

271:   VecGetArray(xx,&x);
272:   VecGetArray(zz,&z);

274:   idx   = a->j;
275:   v     = a->a;
276:   ii    = a->i;

278:   for (i=0; i<mbs; i++) {
279:     n  = ii[1] - ii[0]; ii++;
280:     sum1 = 0.0; sum2 = 0.0;
281:     for (j=0; j<n; j++) {
282:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
283:       sum1 += v[0]*x1 + v[2]*x2;
284:       sum2 += v[1]*x1 + v[3]*x2;
285:       v += 4;
286:     }
287:     z[0] = sum1; z[1] = sum2;
288:     z += 2;
289:   }
290:   VecRestoreArray(xx,&x);
291:   VecRestoreArray(zz,&z);
292:   PetscLogFlops(8*a->nz - A->m);
293:   return(0);
294: }

298: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
299: {
300:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
301:   PetscScalar    *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
302:   MatScalar      *v;
304:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

306: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
307: #pragma disjoint(*v,*z,*xb)
308: #endif

311:   VecGetArray(xx,&x);
312:   VecGetArray(zz,&z);

314:   idx   = a->j;
315:   v     = a->a;
316:   ii    = a->i;

318:   for (i=0; i<mbs; i++) {
319:     n  = ii[1] - ii[0]; ii++;
320:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
321:     for (j=0; j<n; j++) {
322:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
323:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
324:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
325:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
326:       v += 9;
327:     }
328:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
329:     z += 3;
330:   }
331:   VecRestoreArray(xx,&x);
332:   VecRestoreArray(zz,&z);
333:   PetscLogFlops(18*a->nz - A->m);
334:   return(0);
335: }

339: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
340: {
341:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
342:   PetscScalar    *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
343:   MatScalar      *v;
345:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

348:   VecGetArray(xx,&x);
349:   VecGetArray(zz,&z);

351:   idx   = a->j;
352:   v     = a->a;
353:   ii    = a->i;

355:   for (i=0; i<mbs; i++) {
356:     n  = ii[1] - ii[0]; ii++;
357:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
358:     for (j=0; j<n; j++) {
359:       xb = x + 4*(*idx++);
360:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
361:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
362:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
363:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
364:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
365:       v += 16;
366:     }
367:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
368:     z += 4;
369:   }
370:   VecRestoreArray(xx,&x);
371:   VecRestoreArray(zz,&z);
372:   PetscLogFlops(32*a->nz - A->m);
373:   return(0);
374: }

378: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
379: {
380:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
381:   PetscScalar    sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
382:   MatScalar      *v;
384:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

387:   VecGetArray(xx,&x);
388:   VecGetArray(zz,&z);

390:   idx   = a->j;
391:   v     = a->a;
392:   ii    = a->i;

394:   for (i=0; i<mbs; i++) {
395:     n  = ii[1] - ii[0]; ii++;
396:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
397:     for (j=0; j<n; j++) {
398:       xb = x + 5*(*idx++);
399:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
400:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
401:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
402:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
403:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
404:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
405:       v += 25;
406:     }
407:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
408:     z += 5;
409:   }
410:   VecRestoreArray(xx,&x);
411:   VecRestoreArray(zz,&z);
412:   PetscLogFlops(50*a->nz - A->m);
413:   return(0);
414: }


419: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
420: {
421:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
422:   PetscScalar    *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
423:   PetscScalar    x1,x2,x3,x4,x5,x6;
424:   MatScalar      *v;
426:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

429:   VecGetArray(xx,&x);
430:   VecGetArray(zz,&z);

432:   idx   = a->j;
433:   v     = a->a;
434:   ii    = a->i;

436:   for (i=0; i<mbs; i++) {
437:     n  = ii[1] - ii[0]; ii++;
438:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
439:     for (j=0; j<n; j++) {
440:       xb = x + 6*(*idx++);
441:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
442:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
443:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
444:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
445:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
446:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
447:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
448:       v += 36;
449:     }
450:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
451:     z += 6;
452:   }

454:   VecRestoreArray(xx,&x);
455:   VecRestoreArray(zz,&z);
456:   PetscLogFlops(72*a->nz - A->m);
457:   return(0);
458: }
461: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
462: {
463:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
464:   PetscScalar    *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
465:   PetscScalar    x1,x2,x3,x4,x5,x6,x7;
466:   MatScalar      *v;
468:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

471:   VecGetArray(xx,&x);
472:   VecGetArray(zz,&z);

474:   idx   = a->j;
475:   v     = a->a;
476:   ii    = a->i;

478:   for (i=0; i<mbs; i++) {
479:     n  = ii[1] - ii[0]; ii++;
480:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
481:     for (j=0; j<n; j++) {
482:       xb = x + 7*(*idx++);
483:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
484:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
485:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
486:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
487:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
488:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
489:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
490:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
491:       v += 49;
492:     }
493:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
494:     z += 7;
495:   }

497:   VecRestoreArray(xx,&x);
498:   VecRestoreArray(zz,&z);
499:   PetscLogFlops(98*a->nz - A->m);
500:   return(0);
501: }

503: /*
504:     This will not work with MatScalar == float because it calls the BLAS
505: */
508: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
509: {
510:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
511:   PetscScalar    *x,*z,*xb,*work,*workt;
512:   MatScalar      *v;
514:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
515:   PetscInt       ncols,k;

518:   VecGetArray(xx,&x);
519:   VecGetArray(zz,&z);

521:   idx   = a->j;
522:   v     = a->a;
523:   ii    = a->i;


526:   if (!a->mult_work) {
527:     k    = PetscMax(A->m,A->n);
528:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
529:   }
530:   work = a->mult_work;
531:   for (i=0; i<mbs; i++) {
532:     n     = ii[1] - ii[0]; ii++;
533:     ncols = n*bs;
534:     workt = work;
535:     for (j=0; j<n; j++) {
536:       xb = x + bs*(*idx++);
537:       for (k=0; k<bs; k++) workt[k] = xb[k];
538:       workt += bs;
539:     }
540:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
541:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
542:     v += n*bs2;
543:     z += bs;
544:   }
545:   VecRestoreArray(xx,&x);
546:   VecRestoreArray(zz,&z);
547:   PetscLogFlops(2*a->nz*bs2 - A->m);
548:   return(0);
549: }

553: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
554: {
555:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
556:   PetscScalar    *x,*y,*z,sum;
557:   MatScalar      *v;
559:   PetscInt       mbs=a->mbs,i,*idx,*ii,n;

562:   VecGetArray(xx,&x);
563:   VecGetArray(yy,&y);
564:   if (zz != yy) {
565:     VecGetArray(zz,&z);
566:   } else {
567:     z = y;
568:   }

570:   idx   = a->j;
571:   v     = a->a;
572:   ii    = a->i;

574:   for (i=0; i<mbs; i++) {
575:     n    = ii[1] - ii[0]; ii++;
576:     sum  = y[i];
577:     while (n--) sum += *v++ * x[*idx++];
578:     z[i] = sum;
579:   }
580:   VecRestoreArray(xx,&x);
581:   VecRestoreArray(yy,&y);
582:   if (zz != yy) {
583:     VecRestoreArray(zz,&z);
584:   }
585:   PetscLogFlops(2*a->nz);
586:   return(0);
587: }

591: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
592: {
593:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
594:   PetscScalar    *x,*y,*z,*xb,sum1,sum2;
595:   PetscScalar    x1,x2;
596:   MatScalar      *v;
598:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

601:   VecGetArray(xx,&x);
602:   VecGetArray(yy,&y);
603:   if (zz != yy) {
604:     VecGetArray(zz,&z);
605:   } else {
606:     z = y;
607:   }

609:   idx   = a->j;
610:   v     = a->a;
611:   ii    = a->i;

613:   for (i=0; i<mbs; i++) {
614:     n  = ii[1] - ii[0]; ii++;
615:     sum1 = y[0]; sum2 = y[1];
616:     for (j=0; j<n; j++) {
617:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
618:       sum1 += v[0]*x1 + v[2]*x2;
619:       sum2 += v[1]*x1 + v[3]*x2;
620:       v += 4;
621:     }
622:     z[0] = sum1; z[1] = sum2;
623:     z += 2; y += 2;
624:   }
625:   VecRestoreArray(xx,&x);
626:   VecRestoreArray(yy,&y);
627:   if (zz != yy) {
628:     VecRestoreArray(zz,&z);
629:   }
630:   PetscLogFlops(4*a->nz);
631:   return(0);
632: }

636: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
637: {
638:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
639:   PetscScalar    *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
640:   MatScalar      *v;
642:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

645:   VecGetArray(xx,&x);
646:   VecGetArray(yy,&y);
647:   if (zz != yy) {
648:     VecGetArray(zz,&z);
649:   } else {
650:     z = y;
651:   }

653:   idx   = a->j;
654:   v     = a->a;
655:   ii    = a->i;

657:   for (i=0; i<mbs; i++) {
658:     n  = ii[1] - ii[0]; ii++;
659:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
660:     for (j=0; j<n; j++) {
661:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
662:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
663:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
664:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
665:       v += 9;
666:     }
667:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
668:     z += 3; y += 3;
669:   }
670:   VecRestoreArray(xx,&x);
671:   VecRestoreArray(yy,&y);
672:   if (zz != yy) {
673:     VecRestoreArray(zz,&z);
674:   }
675:   PetscLogFlops(18*a->nz);
676:   return(0);
677: }

681: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
682: {
683:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
684:   PetscScalar    *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
685:   MatScalar      *v;
687:   PetscInt       mbs=a->mbs,i,*idx,*ii;
688:   PetscInt       j,n;

691:   VecGetArray(xx,&x);
692:   VecGetArray(yy,&y);
693:   if (zz != yy) {
694:     VecGetArray(zz,&z);
695:   } else {
696:     z = y;
697:   }

699:   idx   = a->j;
700:   v     = a->a;
701:   ii    = a->i;

703:   for (i=0; i<mbs; i++) {
704:     n  = ii[1] - ii[0]; ii++;
705:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
706:     for (j=0; j<n; j++) {
707:       xb = x + 4*(*idx++);
708:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
709:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
710:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
711:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
712:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
713:       v += 16;
714:     }
715:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
716:     z += 4; y += 4;
717:   }
718:   VecRestoreArray(xx,&x);
719:   VecRestoreArray(yy,&y);
720:   if (zz != yy) {
721:     VecRestoreArray(zz,&z);
722:   }
723:   PetscLogFlops(32*a->nz);
724:   return(0);
725: }

729: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
730: {
731:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
732:   PetscScalar    *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
733:   MatScalar      *v;
735:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

738:   VecGetArray(xx,&x);
739:   VecGetArray(yy,&y);
740:   if (zz != yy) {
741:     VecGetArray(zz,&z);
742:   } else {
743:     z = y;
744:   }

746:   idx   = a->j;
747:   v     = a->a;
748:   ii    = a->i;

750:   for (i=0; i<mbs; i++) {
751:     n  = ii[1] - ii[0]; ii++;
752:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
753:     for (j=0; j<n; j++) {
754:       xb = x + 5*(*idx++);
755:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
756:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
757:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
758:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
759:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
760:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
761:       v += 25;
762:     }
763:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
764:     z += 5; y += 5;
765:   }
766:   VecRestoreArray(xx,&x);
767:   VecRestoreArray(yy,&y);
768:   if (zz != yy) {
769:     VecRestoreArray(zz,&z);
770:   }
771:   PetscLogFlops(50*a->nz);
772:   return(0);
773: }
776: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
777: {
778:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
779:   PetscScalar    *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
780:   PetscScalar    x1,x2,x3,x4,x5,x6;
781:   MatScalar      *v;
783:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

786:   VecGetArray(xx,&x);
787:   VecGetArray(yy,&y);
788:   if (zz != yy) {
789:     VecGetArray(zz,&z);
790:   } else {
791:     z = y;
792:   }

794:   idx   = a->j;
795:   v     = a->a;
796:   ii    = a->i;

798:   for (i=0; i<mbs; i++) {
799:     n  = ii[1] - ii[0]; ii++;
800:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
801:     for (j=0; j<n; j++) {
802:       xb = x + 6*(*idx++);
803:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
804:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
805:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
806:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
807:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
808:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
809:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
810:       v += 36;
811:     }
812:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
813:     z += 6; y += 6;
814:   }
815:   VecRestoreArray(xx,&x);
816:   VecRestoreArray(yy,&y);
817:   if (zz != yy) {
818:     VecRestoreArray(zz,&z);
819:   }
820:   PetscLogFlops(72*a->nz);
821:   return(0);
822: }

826: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
827: {
828:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
829:   PetscScalar    *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
830:   PetscScalar    x1,x2,x3,x4,x5,x6,x7;
831:   MatScalar      *v;
833:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n;

836:   VecGetArray(xx,&x);
837:   VecGetArray(yy,&y);
838:   if (zz != yy) {
839:     VecGetArray(zz,&z);
840:   } else {
841:     z = y;
842:   }

844:   idx   = a->j;
845:   v     = a->a;
846:   ii    = a->i;

848:   for (i=0; i<mbs; i++) {
849:     n  = ii[1] - ii[0]; ii++;
850:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
851:     for (j=0; j<n; j++) {
852:       xb = x + 7*(*idx++);
853:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
854:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
855:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
856:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
857:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
858:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
859:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
860:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
861:       v += 49;
862:     }
863:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
864:     z += 7; y += 7;
865:   }
866:   VecRestoreArray(xx,&x);
867:   VecRestoreArray(yy,&y);
868:   if (zz != yy) {
869:     VecRestoreArray(zz,&z);
870:   }
871:   PetscLogFlops(98*a->nz);
872:   return(0);
873: }

877: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
878: {
879:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
880:   PetscScalar    *x,*z,*xb,*work,*workt,*y;
881:   MatScalar      *v;
883:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
884:   PetscInt       ncols,k;

887:   VecGetArray(xx,&x);
888:   VecGetArray(zz,&z);
889:   if (zz != yy) {
890:     VecGetArray(yy,&y);
891:     PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
892:     VecRestoreArray(yy,&y);
893:   }

895:   idx   = a->j;
896:   v     = a->a;
897:   ii    = a->i;


900:   if (!a->mult_work) {
901:     k    = PetscMax(A->m,A->n);
902:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
903:   }
904:   work = a->mult_work;
905:   for (i=0; i<mbs; i++) {
906:     n     = ii[1] - ii[0]; ii++;
907:     ncols = n*bs;
908:     workt = work;
909:     for (j=0; j<n; j++) {
910:       xb = x + bs*(*idx++);
911:       for (k=0; k<bs; k++) workt[k] = xb[k];
912:       workt += bs;
913:     }
914:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
915:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
916:     v += n*bs2;
917:     z += bs;
918:   }
919:   VecRestoreArray(xx,&x);
920:   VecRestoreArray(zz,&z);
921:   PetscLogFlops(2*a->nz*bs2);
922:   return(0);
923: }

927: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
928: {
929:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
930:   PetscScalar    *xg,*zg,*zb,zero = 0.0;
931:   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
932:   MatScalar      *v;
933:   PetscInt       mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
935:   PetscInt       bs=A->bs,j,n,bs2=a->bs2,*ib;

938:   VecSet(&zero,zz);
939:   VecGetArray(xx,&xg); x = xg;
940:   VecGetArray(zz,&zg); z = zg;

942:   idx   = a->j;
943:   v     = a->a;
944:   ii    = a->i;
945:   xb    = x;
946:   switch (bs) {
947:   case 1:
948:     for (i=0; i<mbs; i++) {
949:       n  = ii[1] - ii[0]; ii++;
950:       x1 = xb[0];
951:       ib = idx + ai[i];
952:       for (j=0; j<n; j++) {
953:         rval    = ib[j];
954:         z[rval] += *v * x1;
955:         v++;
956:       }
957:       xb++;
958:     }
959:     break;
960:   case 2:
961:     for (i=0; i<mbs; i++) {
962:       n  = ii[1] - ii[0]; ii++;
963:       x1 = xb[0]; x2 = xb[1];
964:       ib = idx + ai[i];
965:       for (j=0; j<n; j++) {
966:         rval      = ib[j]*2;
967:         z[rval++] += v[0]*x1 + v[1]*x2;
968:         z[rval]   += v[2]*x1 + v[3]*x2;
969:         v  += 4;
970:       }
971:       xb += 2;
972:     }
973:     break;
974:   case 3:
975:     for (i=0; i<mbs; i++) {
976:       n  = ii[1] - ii[0]; ii++;
977:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
978:       ib = idx + ai[i];
979:       for (j=0; j<n; j++) {
980:         rval      = ib[j]*3;
981:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
982:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
983:         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
984:         v  += 9;
985:       }
986:       xb += 3;
987:     }
988:     break;
989:   case 4:
990:     for (i=0; i<mbs; i++) {
991:       n  = ii[1] - ii[0]; ii++;
992:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
993:       ib = idx + ai[i];
994:       for (j=0; j<n; j++) {
995:         rval      = ib[j]*4;
996:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
997:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
998:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
999:         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1000:         v  += 16;
1001:       }
1002:       xb += 4;
1003:     }
1004:     break;
1005:   case 5:
1006:     for (i=0; i<mbs; i++) {
1007:       n  = ii[1] - ii[0]; ii++;
1008:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1009:       x4 = xb[3]; x5 = xb[4];
1010:       ib = idx + ai[i];
1011:       for (j=0; j<n; j++) {
1012:         rval      = ib[j]*5;
1013:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1014:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1015:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1016:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1017:         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1018:         v  += 25;
1019:       }
1020:       xb += 5;
1021:     }
1022:     break;
1023:   case 6:
1024:     for (i=0; i<mbs; i++) {
1025:       n  = ii[1] - ii[0]; ii++;
1026:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1027:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1028:       ib = idx + ai[i];
1029:       for (j=0; j<n; j++) {
1030:         rval      = ib[j]*6;
1031:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
1032:         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1033:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1034:         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1035:         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1036:         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1037:         v  += 36;
1038:       }
1039:       xb += 6;
1040:     }
1041:     break;
1042:   case 7:
1043:     for (i=0; i<mbs; i++) {
1044:       n  = ii[1] - ii[0]; ii++;
1045:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1046:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1047:       ib = idx + ai[i];
1048:       for (j=0; j<n; j++) {
1049:         rval      = ib[j]*7;
1050:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1051:         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1052:         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1053:         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1054:         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1055:         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1056:         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1057:         v  += 49;
1058:       }
1059:       xb += 7;
1060:     }
1061:     break;
1062:   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1063:       PetscInt          ncols,k;
1064:       PetscScalar  *work,*workt;

1066:       if (!a->mult_work) {
1067:         k = PetscMax(A->m,A->n);
1068:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1069:       }
1070:       work = a->mult_work;
1071:       for (i=0; i<mbs; i++) {
1072:         n     = ii[1] - ii[0]; ii++;
1073:         ncols = n*bs;
1074:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1075:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1076:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1077:         v += n*bs2;
1078:         x += bs;
1079:         workt = work;
1080:         for (j=0; j<n; j++) {
1081:           zb = z + bs*(*idx++);
1082:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1083:           workt += bs;
1084:         }
1085:       }
1086:     }
1087:   }
1088:   VecRestoreArray(xx,&xg);
1089:   VecRestoreArray(zz,&zg);
1090:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1091:   return(0);
1092: }

1096: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1098: {
1099:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1100:   PetscScalar    *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1101:   MatScalar      *v;
1103:   PetscInt       mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=A->bs,j,n,bs2=a->bs2,*ib;

1106:   if (yy != zz) { VecCopy(yy,zz); }
1107:   VecGetArray(xx,&xg); x = xg;
1108:   VecGetArray(zz,&zg); z = zg;


1111:   idx   = a->j;
1112:   v     = a->a;
1113:   ii    = a->i;
1114:   xb    = x;

1116:   switch (bs) {
1117:   case 1:
1118:     for (i=0; i<mbs; i++) {
1119:       n  = ii[1] - ii[0]; ii++;
1120:       x1 = xb[0];
1121:       ib = idx + ai[i];
1122:       for (j=0; j<n; j++) {
1123:         rval    = ib[j];
1124:         z[rval] += *v * x1;
1125:         v++;
1126:       }
1127:       xb++;
1128:     }
1129:     break;
1130:   case 2:
1131:     for (i=0; i<mbs; i++) {
1132:       n  = ii[1] - ii[0]; ii++;
1133:       x1 = xb[0]; x2 = xb[1];
1134:       ib = idx + ai[i];
1135:       for (j=0; j<n; j++) {
1136:         rval      = ib[j]*2;
1137:         z[rval++] += v[0]*x1 + v[1]*x2;
1138:         z[rval++] += v[2]*x1 + v[3]*x2;
1139:         v  += 4;
1140:       }
1141:       xb += 2;
1142:     }
1143:     break;
1144:   case 3:
1145:     for (i=0; i<mbs; i++) {
1146:       n  = ii[1] - ii[0]; ii++;
1147:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1148:       ib = idx + ai[i];
1149:       for (j=0; j<n; j++) {
1150:         rval      = ib[j]*3;
1151:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1152:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1153:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1154:         v  += 9;
1155:       }
1156:       xb += 3;
1157:     }
1158:     break;
1159:   case 4:
1160:     for (i=0; i<mbs; i++) {
1161:       n  = ii[1] - ii[0]; ii++;
1162:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1163:       ib = idx + ai[i];
1164:       for (j=0; j<n; j++) {
1165:         rval      = ib[j]*4;
1166:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1167:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1168:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1169:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1170:         v  += 16;
1171:       }
1172:       xb += 4;
1173:     }
1174:     break;
1175:   case 5:
1176:     for (i=0; i<mbs; i++) {
1177:       n  = ii[1] - ii[0]; ii++;
1178:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1179:       x4 = xb[3]; x5 = xb[4];
1180:       ib = idx + ai[i];
1181:       for (j=0; j<n; j++) {
1182:         rval      = ib[j]*5;
1183:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1184:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1185:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1186:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1187:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1188:         v  += 25;
1189:       }
1190:       xb += 5;
1191:     }
1192:     break;
1193:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1194:       PetscInt          ncols,k;
1195:       PetscScalar  *work,*workt;

1197:       if (!a->mult_work) {
1198:         k = PetscMax(A->m,A->n);
1199:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1200:       }
1201:       work = a->mult_work;
1202:       for (i=0; i<mbs; i++) {
1203:         n     = ii[1] - ii[0]; ii++;
1204:         ncols = n*bs;
1205:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1206:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1207:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1208:         v += n*bs2;
1209:         x += bs;
1210:         workt = work;
1211:         for (j=0; j<n; j++) {
1212:           zb = z + bs*(*idx++);
1213:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1214:           workt += bs;
1215:         }
1216:       }
1217:     }
1218:   }
1219:   VecRestoreArray(xx,&xg);
1220:   VecRestoreArray(zz,&zg);
1221:   PetscLogFlops(2*a->nz*a->bs2);
1222:   return(0);
1223: }

1227: PetscErrorCode MatScale_SeqBAIJ(const PetscScalar *alpha,Mat inA)
1228: {
1229:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)inA->data;
1230:   PetscInt     totalnz = a->bs2*a->nz;
1231: #if defined(PETSC_USE_MAT_SINGLE)
1232:   PetscInt     i;
1233: #else
1234:   PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1235: #endif

1238: #if defined(PETSC_USE_MAT_SINGLE)
1239:   for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1240: #else
1241:   BLscal_(&tnz,(PetscScalar*)alpha,a->a,&one);
1242: #endif
1243:   PetscLogFlops(totalnz);
1244:   return(0);
1245: }

1249: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1250: {
1251:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1252:   MatScalar   *v = a->a;
1253:   PetscReal   sum = 0.0;
1254:   PetscInt    i,j,k,bs = A->bs,nz=a->nz,bs2=a->bs2,k1;

1257:   if (type == NORM_FROBENIUS) {
1258:     for (i=0; i< bs2*nz; i++) {
1259: #if defined(PETSC_USE_COMPLEX)
1260:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1261: #else
1262:       sum += (*v)*(*v); v++;
1263: #endif
1264:     }
1265:     *norm = sqrt(sum);
1266:   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1267:     *norm = 0.0;
1268:     for (k=0; k<bs; k++) {
1269:       for (j=0; j<a->mbs; j++) {
1270:         v = a->a + bs2*a->i[j] + k;
1271:         sum = 0.0;
1272:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1273:           for (k1=0; k1<bs; k1++){
1274:             sum += PetscAbsScalar(*v);
1275:             v   += bs;
1276:           }
1277:         }
1278:         if (sum > *norm) *norm = sum;
1279:       }
1280:     }
1281:   } else {
1282:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1283:   }
1284:   return(0);
1285: }


1290: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1291: {
1292:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1296:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1297:   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1298:     *flg = PETSC_FALSE;
1299:     return(0);
1300:   }
1301: 
1302:   /* if the a->i are the same */
1303:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1304:   if (*flg == PETSC_FALSE) {
1305:     return(0);
1306:   }
1307: 
1308:   /* if a->j are the same */
1309:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1310:   if (*flg == PETSC_FALSE) {
1311:     return(0);
1312:   }
1313:   /* if a->a are the same */
1314:   PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(B->bs)*sizeof(PetscScalar),flg);
1315:   return(0);
1316: 
1317: }

1321: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1322: {
1323:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1325:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1326:   PetscScalar    *x,zero = 0.0;
1327:   MatScalar      *aa,*aa_j;

1330:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1331:   bs   = A->bs;
1332:   aa   = a->a;
1333:   ai   = a->i;
1334:   aj   = a->j;
1335:   ambs = a->mbs;
1336:   bs2  = a->bs2;

1338:   VecSet(&zero,v);
1339:   VecGetArray(v,&x);
1340:   VecGetLocalSize(v,&n);
1341:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1342:   for (i=0; i<ambs; i++) {
1343:     for (j=ai[i]; j<ai[i+1]; j++) {
1344:       if (aj[j] == i) {
1345:         row  = i*bs;
1346:         aa_j = aa+j*bs2;
1347:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1348:         break;
1349:       }
1350:     }
1351:   }
1352:   VecRestoreArray(v,&x);
1353:   return(0);
1354: }

1358: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1359: {
1360:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1361:   PetscScalar    *l,*r,x,*li,*ri;
1362:   MatScalar      *aa,*v;
1364:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1367:   ai  = a->i;
1368:   aj  = a->j;
1369:   aa  = a->a;
1370:   m   = A->m;
1371:   n   = A->n;
1372:   bs  = A->bs;
1373:   mbs = a->mbs;
1374:   bs2 = a->bs2;
1375:   if (ll) {
1376:     VecGetArray(ll,&l);
1377:     VecGetLocalSize(ll,&lm);
1378:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1379:     for (i=0; i<mbs; i++) { /* for each block row */
1380:       M  = ai[i+1] - ai[i];
1381:       li = l + i*bs;
1382:       v  = aa + bs2*ai[i];
1383:       for (j=0; j<M; j++) { /* for each block */
1384:         for (k=0; k<bs2; k++) {
1385:           (*v++) *= li[k%bs];
1386:         }
1387:       }
1388:     }
1389:     VecRestoreArray(ll,&l);
1390:     PetscLogFlops(a->nz);
1391:   }
1392: 
1393:   if (rr) {
1394:     VecGetArray(rr,&r);
1395:     VecGetLocalSize(rr,&rn);
1396:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1397:     for (i=0; i<mbs; i++) { /* for each block row */
1398:       M  = ai[i+1] - ai[i];
1399:       v  = aa + bs2*ai[i];
1400:       for (j=0; j<M; j++) { /* for each block */
1401:         ri = r + bs*aj[ai[i]+j];
1402:         for (k=0; k<bs; k++) {
1403:           x = ri[k];
1404:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1405:         }
1406:       }
1407:     }
1408:     VecRestoreArray(rr,&r);
1409:     PetscLogFlops(a->nz);
1410:   }
1411:   return(0);
1412: }


1417: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1418: {
1419:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1422:   info->rows_global    = (double)A->m;
1423:   info->columns_global = (double)A->n;
1424:   info->rows_local     = (double)A->m;
1425:   info->columns_local  = (double)A->n;
1426:   info->block_size     = a->bs2;
1427:   info->nz_allocated   = a->maxnz;
1428:   info->nz_used        = a->bs2*a->nz;
1429:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1430:   info->assemblies   = A->num_ass;
1431:   info->mallocs      = a->reallocs;
1432:   info->memory       = A->mem;
1433:   if (A->factor) {
1434:     info->fill_ratio_given  = A->info.fill_ratio_given;
1435:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1436:     info->factor_mallocs    = A->info.factor_mallocs;
1437:   } else {
1438:     info->fill_ratio_given  = 0;
1439:     info->fill_ratio_needed = 0;
1440:     info->factor_mallocs    = 0;
1441:   }
1442:   return(0);
1443: }


1448: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1449: {
1450:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1454:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1455:   return(0);
1456: }