Actual source code: sbaij2.c
petsc-3.11.4 2019-09-28
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
12: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13: const PetscInt *idx;
14: PetscBT table_out,table_in;
17: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18: mbs = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = A->rmap->bs;
22: PetscBTCreate(mbs,&table_out);
23: PetscMalloc1(mbs+1,&nidx);
24: PetscMalloc1(A->rmap->N+1,&nidx2);
25: PetscBTCreate(mbs,&table_in);
27: for (i=0; i<is_max; i++) { /* for each is */
28: isz = 0;
29: PetscBTMemzero(mbs,table_out);
31: /* Extract the indices, assume there can be duplicate entries */
32: ISGetIndices(is[i],&idx);
33: ISGetLocalSize(is[i],&n);
35: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36: bcol_max = 0;
37: for (j=0; j<n; ++j) {
38: brow = idx[j]/bs; /* convert the indices into block indices */
39: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40: if (!PetscBTLookupSet(table_out,brow)) {
41: nidx[isz++] = brow;
42: if (bcol_max < brow) bcol_max = brow;
43: }
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap */
50: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51: PetscBTMemzero(mbs,table_in);
52: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
54: n = isz; /* length of the updated is[i] */
55: for (brow=0; brow<mbs; brow++) {
56: start = ai[brow]; end = ai[brow+1];
57: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58: for (l = start; l<end; l++) {
59: bcol = aj[l];
60: if (!PetscBTLookupSet(table_out,bcol)) {
61: nidx[isz++] = bcol;
62: if (bcol_max < bcol) bcol_max = bcol;
63: }
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l<end; l++) {
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table_in,bcol)) {
72: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
80: /* expand the Index Set */
81: for (j=0; j<isz; j++) {
82: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83: }
84: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
85: }
86: PetscBTDestroy(&table_out);
87: PetscFree(nidx);
88: PetscFree(nidx2);
89: PetscBTDestroy(&table_in);
90: return(0);
91: }
93: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94: Zero some ops' to avoid invalid usse */
95: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96: {
100: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
101: Bseq->ops->mult = 0;
102: Bseq->ops->multadd = 0;
103: Bseq->ops->multtranspose = 0;
104: Bseq->ops->multtransposeadd = 0;
105: Bseq->ops->lufactor = 0;
106: Bseq->ops->choleskyfactor = 0;
107: Bseq->ops->lufactorsymbolic = 0;
108: Bseq->ops->choleskyfactorsymbolic = 0;
109: Bseq->ops->getinertia = 0;
110: return(0);
111: }
113: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
114: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115: {
116: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
118: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
120: const PetscInt *irow,*icol;
121: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122: PetscInt *aj = a->j,*ai = a->i;
123: MatScalar *mat_a;
124: Mat C;
125: PetscBool flag;
129: ISGetIndices(isrow,&irow);
130: ISGetIndices(iscol,&icol);
131: ISGetLocalSize(isrow,&nrows);
132: ISGetLocalSize(iscol,&ncols);
134: PetscCalloc1(1+oldcols,&smap);
135: ssmap = smap;
136: PetscMalloc1(1+nrows,&lens);
137: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138: /* determine lens of each row */
139: for (i=0; i<nrows; i++) {
140: kstart = ai[irow[i]];
141: kend = kstart + a->ilen[irow[i]];
142: lens[i] = 0;
143: for (k=kstart; k<kend; k++) {
144: if (ssmap[aj[k]]) lens[i]++;
145: }
146: }
147: /* Create and fill new matrix */
148: if (scall == MAT_REUSE_MATRIX) {
149: c = (Mat_SeqSBAIJ*)((*B)->data);
151: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
153: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
155: C = *B;
156: } else {
157: MatCreate(PetscObjectComm((PetscObject)A),&C);
158: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
159: MatSetType(C,((PetscObject)A)->type_name);
160: MatSeqSBAIJSetPreallocation(C,bs,0,lens);
161: }
162: c = (Mat_SeqSBAIJ*)(C->data);
163: for (i=0; i<nrows; i++) {
164: row = irow[i];
165: kstart = ai[row];
166: kend = kstart + a->ilen[row];
167: mat_i = c->i[i];
168: mat_j = c->j + mat_i;
169: mat_a = c->a + mat_i*bs2;
170: mat_ilen = c->ilen + i;
171: for (k=kstart; k<kend; k++) {
172: if ((tcol=ssmap[a->j[k]])) {
173: *mat_j++ = tcol - 1;
174: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
175: mat_a += bs2;
176: (*mat_ilen)++;
177: }
178: }
179: }
180: /* sort */
181: {
182: MatScalar *work;
184: PetscMalloc1(bs2,&work);
185: for (i=0; i<nrows; i++) {
186: PetscInt ilen;
187: mat_i = c->i[i];
188: mat_j = c->j + mat_i;
189: mat_a = c->a + mat_i*bs2;
190: ilen = c->ilen[i];
191: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
192: }
193: PetscFree(work);
194: }
196: /* Free work space */
197: ISRestoreIndices(iscol,&icol);
198: PetscFree(smap);
199: PetscFree(lens);
200: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
201: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
203: ISRestoreIndices(isrow,&irow);
204: *B = C;
205: return(0);
206: }
208: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
209: {
210: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
211: IS is1,is2;
213: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214: const PetscInt *irow,*icol;
217: ISGetIndices(isrow,&irow);
218: ISGetIndices(iscol,&icol);
219: ISGetLocalSize(isrow,&nrows);
220: ISGetLocalSize(iscol,&ncols);
222: /* Verify if the indices corespond to each element in a block
223: and form the IS with compressed IS */
224: maxmnbs = PetscMax(a->mbs,a->nbs);
225: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
226: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
227: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228: for (i=0; i<a->mbs; i++) {
229: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230: }
231: count = 0;
232: for (i=0; i<nrows; i++) {
233: PetscInt j = irow[i] / bs;
234: if ((vary[j]--)==bs) iary[count++] = j;
235: }
236: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
238: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
239: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240: for (i=0; i<a->nbs; i++) {
241: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242: }
243: count = 0;
244: for (i=0; i<ncols; i++) {
245: PetscInt j = icol[i] / bs;
246: if ((vary[j]--)==bs) iary[count++] = j;
247: }
248: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
249: ISRestoreIndices(isrow,&irow);
250: ISRestoreIndices(iscol,&icol);
251: PetscFree2(vary,iary);
253: MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
254: ISDestroy(&is1);
255: ISDestroy(&is2);
257: if (isrow != iscol) {
258: PetscBool isequal;
259: ISEqual(isrow,iscol,&isequal);
260: if (!isequal) {
261: MatSeqSBAIJZeroOps_Private(*B);
262: }
263: }
264: return(0);
265: }
267: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
268: {
270: PetscInt i;
273: if (scall == MAT_INITIAL_MATRIX) {
274: PetscCalloc1(n+1,B);
275: }
277: for (i=0; i<n; i++) {
278: MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
279: }
280: return(0);
281: }
283: /* -------------------------------------------------------*/
284: /* Should check that shapes of vectors and matrices match */
285: /* -------------------------------------------------------*/
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
290: PetscScalar *z,x1,x2,zero=0.0;
291: const PetscScalar *x,*xb;
292: const MatScalar *v;
293: PetscErrorCode ierr;
294: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
295: const PetscInt *aj=a->j,*ai=a->i,*ib;
296: PetscInt nonzerorow=0;
299: VecSet(zz,zero);
300: VecGetArrayRead(xx,&x);
301: VecGetArray(zz,&z);
303: v = a->a;
304: xb = x;
306: for (i=0; i<mbs; i++) {
307: n = ai[1] - ai[0]; /* length of i_th block row of A */
308: x1 = xb[0]; x2 = xb[1];
309: ib = aj + *ai;
310: jmin = 0;
311: nonzerorow += (n>0);
312: if (*ib == i) { /* (diag of A)*x */
313: z[2*i] += v[0]*x1 + v[2]*x2;
314: z[2*i+1] += v[2]*x1 + v[3]*x2;
315: v += 4; jmin++;
316: }
317: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
318: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
319: for (j=jmin; j<n; j++) {
320: /* (strict lower triangular part of A)*x */
321: cval = ib[j]*2;
322: z[cval] += v[0]*x1 + v[1]*x2;
323: z[cval+1] += v[2]*x1 + v[3]*x2;
324: /* (strict upper triangular part of A)*x */
325: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
326: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
327: v += 4;
328: }
329: xb +=2; ai++;
330: }
332: VecRestoreArrayRead(xx,&x);
333: VecRestoreArray(zz,&z);
334: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
335: return(0);
336: }
338: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
339: {
340: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
341: PetscScalar *z,x1,x2,x3,zero=0.0;
342: const PetscScalar *x,*xb;
343: const MatScalar *v;
344: PetscErrorCode ierr;
345: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
346: const PetscInt *aj = a->j,*ai = a->i,*ib;
347: PetscInt nonzerorow=0;
350: VecSet(zz,zero);
351: VecGetArrayRead(xx,&x);
352: VecGetArray(zz,&z);
354: v = a->a;
355: xb = x;
357: for (i=0; i<mbs; i++) {
358: n = ai[1] - ai[0]; /* length of i_th block row of A */
359: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
360: ib = aj + *ai;
361: jmin = 0;
362: nonzerorow += (n>0);
363: if (*ib == i) { /* (diag of A)*x */
364: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
365: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
366: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
367: v += 9; jmin++;
368: }
369: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
370: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
371: for (j=jmin; j<n; j++) {
372: /* (strict lower triangular part of A)*x */
373: cval = ib[j]*3;
374: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
375: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
376: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
377: /* (strict upper triangular part of A)*x */
378: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
379: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
380: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
381: v += 9;
382: }
383: xb +=3; ai++;
384: }
386: VecRestoreArrayRead(xx,&x);
387: VecRestoreArray(zz,&z);
388: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
389: return(0);
390: }
392: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
393: {
394: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
395: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
396: const PetscScalar *x,*xb;
397: const MatScalar *v;
398: PetscErrorCode ierr;
399: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
400: const PetscInt *aj = a->j,*ai = a->i,*ib;
401: PetscInt nonzerorow = 0;
404: VecSet(zz,zero);
405: VecGetArrayRead(xx,&x);
406: VecGetArray(zz,&z);
408: v = a->a;
409: xb = x;
411: for (i=0; i<mbs; i++) {
412: n = ai[1] - ai[0]; /* length of i_th block row of A */
413: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
414: ib = aj + *ai;
415: jmin = 0;
416: nonzerorow += (n>0);
417: if (*ib == i) { /* (diag of A)*x */
418: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
419: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
420: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
421: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
422: v += 16; jmin++;
423: }
424: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
425: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
426: for (j=jmin; j<n; j++) {
427: /* (strict lower triangular part of A)*x */
428: cval = ib[j]*4;
429: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
430: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
431: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
432: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
433: /* (strict upper triangular part of A)*x */
434: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
435: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
436: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
437: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
438: v += 16;
439: }
440: xb +=4; ai++;
441: }
443: VecRestoreArrayRead(xx,&x);
444: VecRestoreArray(zz,&z);
445: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
446: return(0);
447: }
449: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
450: {
451: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
452: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
453: const PetscScalar *x,*xb;
454: const MatScalar *v;
455: PetscErrorCode ierr;
456: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
457: const PetscInt *aj = a->j,*ai = a->i,*ib;
458: PetscInt nonzerorow=0;
461: VecSet(zz,zero);
462: VecGetArrayRead(xx,&x);
463: VecGetArray(zz,&z);
465: v = a->a;
466: xb = x;
468: for (i=0; i<mbs; i++) {
469: n = ai[1] - ai[0]; /* length of i_th block row of A */
470: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
471: ib = aj + *ai;
472: jmin = 0;
473: nonzerorow += (n>0);
474: if (*ib == i) { /* (diag of A)*x */
475: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
476: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
477: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
478: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
479: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
480: v += 25; jmin++;
481: }
482: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
483: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
484: for (j=jmin; j<n; j++) {
485: /* (strict lower triangular part of A)*x */
486: cval = ib[j]*5;
487: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
488: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
489: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
490: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
491: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
492: /* (strict upper triangular part of A)*x */
493: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
494: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
495: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
496: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
497: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
498: v += 25;
499: }
500: xb +=5; ai++;
501: }
503: VecRestoreArrayRead(xx,&x);
504: VecRestoreArray(zz,&z);
505: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
506: return(0);
507: }
510: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
511: {
512: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
513: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
514: const PetscScalar *x,*xb;
515: const MatScalar *v;
516: PetscErrorCode ierr;
517: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
518: const PetscInt *aj=a->j,*ai=a->i,*ib;
519: PetscInt nonzerorow=0;
522: VecSet(zz,zero);
523: VecGetArrayRead(xx,&x);
524: VecGetArray(zz,&z);
526: v = a->a;
527: xb = x;
529: for (i=0; i<mbs; i++) {
530: n = ai[1] - ai[0]; /* length of i_th block row of A */
531: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
532: ib = aj + *ai;
533: jmin = 0;
534: nonzerorow += (n>0);
535: if (*ib == i) { /* (diag of A)*x */
536: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
537: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
538: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
539: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
540: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
541: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
542: v += 36; jmin++;
543: }
544: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
545: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
546: for (j=jmin; j<n; j++) {
547: /* (strict lower triangular part of A)*x */
548: cval = ib[j]*6;
549: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
550: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
551: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
552: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
553: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
554: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
555: /* (strict upper triangular part of A)*x */
556: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
557: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
558: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
559: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
560: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
561: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
562: v += 36;
563: }
564: xb +=6; ai++;
565: }
567: VecRestoreArrayRead(xx,&x);
568: VecRestoreArray(zz,&z);
569: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
570: return(0);
571: }
572: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
573: {
574: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
575: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
576: const PetscScalar *x,*xb;
577: const MatScalar *v;
578: PetscErrorCode ierr;
579: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
580: const PetscInt *aj=a->j,*ai=a->i,*ib;
581: PetscInt nonzerorow=0;
584: VecSet(zz,zero);
585: VecGetArrayRead(xx,&x);
586: VecGetArray(zz,&z);
588: v = a->a;
589: xb = x;
591: for (i=0; i<mbs; i++) {
592: n = ai[1] - ai[0]; /* length of i_th block row of A */
593: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
594: ib = aj + *ai;
595: jmin = 0;
596: nonzerorow += (n>0);
597: if (*ib == i) { /* (diag of A)*x */
598: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
599: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
600: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
601: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
602: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
603: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
604: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
605: v += 49; jmin++;
606: }
607: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
608: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
609: for (j=jmin; j<n; j++) {
610: /* (strict lower triangular part of A)*x */
611: cval = ib[j]*7;
612: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
613: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
614: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
615: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
616: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
617: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
618: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
619: /* (strict upper triangular part of A)*x */
620: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
621: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
622: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
623: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
624: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
625: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
626: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
627: v += 49;
628: }
629: xb +=7; ai++;
630: }
631: VecRestoreArrayRead(xx,&x);
632: VecRestoreArray(zz,&z);
633: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
634: return(0);
635: }
637: /*
638: This will not work with MatScalar == float because it calls the BLAS
639: */
640: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
641: {
642: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
643: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
644: const PetscScalar *x,*x_ptr,*xb;
645: const MatScalar *v;
646: PetscErrorCode ierr;
647: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
648: const PetscInt *idx,*aj,*ii;
649: PetscInt nonzerorow=0;
652: VecSet(zz,zero);
653: VecGetArrayRead(xx,&x);x_ptr = x;
654: VecGetArray(zz,&z); z_ptr=z;
656: aj = a->j;
657: v = a->a;
658: ii = a->i;
660: if (!a->mult_work) {
661: PetscMalloc1(A->rmap->N+1,&a->mult_work);
662: }
663: work = a->mult_work;
665: for (i=0; i<mbs; i++) {
666: n = ii[1] - ii[0]; ncols = n*bs;
667: workt = work; idx=aj+ii[0];
668: nonzerorow += (n>0);
670: /* upper triangular part */
671: for (j=0; j<n; j++) {
672: xb = x_ptr + bs*(*idx++);
673: for (k=0; k<bs; k++) workt[k] = xb[k];
674: workt += bs;
675: }
676: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
677: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
679: /* strict lower triangular part */
680: idx = aj+ii[0];
681: if (*idx == i) {
682: ncols -= bs; v += bs2; idx++; n--;
683: }
685: if (ncols > 0) {
686: workt = work;
687: PetscMemzero(workt,ncols*sizeof(PetscScalar));
688: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
689: for (j=0; j<n; j++) {
690: zb = z_ptr + bs*(*idx++);
691: for (k=0; k<bs; k++) zb[k] += workt[k];
692: workt += bs;
693: }
694: }
695: x += bs; v += n*bs2; z += bs; ii++;
696: }
698: VecRestoreArrayRead(xx,&x);
699: VecRestoreArray(zz,&z);
700: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
701: return(0);
702: }
704: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
705: {
706: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
707: PetscScalar *z,x1;
708: const PetscScalar *x,*xb;
709: const MatScalar *v;
710: PetscErrorCode ierr;
711: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
712: const PetscInt *aj=a->j,*ai=a->i,*ib;
713: PetscInt nonzerorow=0;
716: VecCopy(yy,zz);
717: VecGetArrayRead(xx,&x);
718: VecGetArray(zz,&z);
719: v = a->a;
720: xb = x;
722: for (i=0; i<mbs; i++) {
723: n = ai[1] - ai[0]; /* length of i_th row of A */
724: x1 = xb[0];
725: ib = aj + *ai;
726: jmin = 0;
727: nonzerorow += (n>0);
728: if (*ib == i) { /* (diag of A)*x */
729: z[i] += *v++ * x[*ib++]; jmin++;
730: }
731: for (j=jmin; j<n; j++) {
732: cval = *ib;
733: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
734: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
735: }
736: xb++; ai++;
737: }
739: VecRestoreArrayRead(xx,&x);
740: VecRestoreArray(zz,&z);
742: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
743: return(0);
744: }
746: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
747: {
748: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
749: PetscScalar *z,x1,x2;
750: const PetscScalar *x,*xb;
751: const MatScalar *v;
752: PetscErrorCode ierr;
753: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
754: const PetscInt *aj=a->j,*ai=a->i,*ib;
755: PetscInt nonzerorow=0;
758: VecCopy(yy,zz);
759: VecGetArrayRead(xx,&x);
760: VecGetArray(zz,&z);
762: v = a->a;
763: xb = x;
765: for (i=0; i<mbs; i++) {
766: n = ai[1] - ai[0]; /* length of i_th block row of A */
767: x1 = xb[0]; x2 = xb[1];
768: ib = aj + *ai;
769: jmin = 0;
770: nonzerorow += (n>0);
771: if (*ib == i) { /* (diag of A)*x */
772: z[2*i] += v[0]*x1 + v[2]*x2;
773: z[2*i+1] += v[2]*x1 + v[3]*x2;
774: v += 4; jmin++;
775: }
776: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
777: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
778: for (j=jmin; j<n; j++) {
779: /* (strict lower triangular part of A)*x */
780: cval = ib[j]*2;
781: z[cval] += v[0]*x1 + v[1]*x2;
782: z[cval+1] += v[2]*x1 + v[3]*x2;
783: /* (strict upper triangular part of A)*x */
784: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
785: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
786: v += 4;
787: }
788: xb +=2; ai++;
789: }
790: VecRestoreArrayRead(xx,&x);
791: VecRestoreArray(zz,&z);
793: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
794: return(0);
795: }
797: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
798: {
799: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
800: PetscScalar *z,x1,x2,x3;
801: const PetscScalar *x,*xb;
802: const MatScalar *v;
803: PetscErrorCode ierr;
804: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
805: const PetscInt *aj=a->j,*ai=a->i,*ib;
806: PetscInt nonzerorow=0;
809: VecCopy(yy,zz);
810: VecGetArrayRead(xx,&x);
811: VecGetArray(zz,&z);
813: v = a->a;
814: xb = x;
816: for (i=0; i<mbs; i++) {
817: n = ai[1] - ai[0]; /* length of i_th block row of A */
818: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
819: ib = aj + *ai;
820: jmin = 0;
821: nonzerorow += (n>0);
822: if (*ib == i) { /* (diag of A)*x */
823: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
824: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
825: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
826: v += 9; jmin++;
827: }
828: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
829: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
830: for (j=jmin; j<n; j++) {
831: /* (strict lower triangular part of A)*x */
832: cval = ib[j]*3;
833: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
834: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
835: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836: /* (strict upper triangular part of A)*x */
837: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
838: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
839: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
840: v += 9;
841: }
842: xb +=3; ai++;
843: }
845: VecRestoreArrayRead(xx,&x);
846: VecRestoreArray(zz,&z);
848: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
849: return(0);
850: }
852: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
853: {
854: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
855: PetscScalar *z,x1,x2,x3,x4;
856: const PetscScalar *x,*xb;
857: const MatScalar *v;
858: PetscErrorCode ierr;
859: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
860: const PetscInt *aj=a->j,*ai=a->i,*ib;
861: PetscInt nonzerorow=0;
864: VecCopy(yy,zz);
865: VecGetArrayRead(xx,&x);
866: VecGetArray(zz,&z);
868: v = a->a;
869: xb = x;
871: for (i=0; i<mbs; i++) {
872: n = ai[1] - ai[0]; /* length of i_th block row of A */
873: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
874: ib = aj + *ai;
875: jmin = 0;
876: nonzerorow += (n>0);
877: if (*ib == i) { /* (diag of A)*x */
878: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
879: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
880: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
881: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
882: v += 16; jmin++;
883: }
884: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
885: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886: for (j=jmin; j<n; j++) {
887: /* (strict lower triangular part of A)*x */
888: cval = ib[j]*4;
889: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
890: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
891: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
892: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
893: /* (strict upper triangular part of A)*x */
894: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
895: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
896: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
897: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
898: v += 16;
899: }
900: xb +=4; ai++;
901: }
903: VecRestoreArrayRead(xx,&x);
904: VecRestoreArray(zz,&z);
906: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
907: return(0);
908: }
910: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
911: {
912: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
913: PetscScalar *z,x1,x2,x3,x4,x5;
914: const PetscScalar *x,*xb;
915: const MatScalar *v;
916: PetscErrorCode ierr;
917: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
918: const PetscInt *aj=a->j,*ai=a->i,*ib;
919: PetscInt nonzerorow=0;
922: VecCopy(yy,zz);
923: VecGetArrayRead(xx,&x);
924: VecGetArray(zz,&z);
926: v = a->a;
927: xb = x;
929: for (i=0; i<mbs; i++) {
930: n = ai[1] - ai[0]; /* length of i_th block row of A */
931: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
932: ib = aj + *ai;
933: jmin = 0;
934: nonzerorow += (n>0);
935: if (*ib == i) { /* (diag of A)*x */
936: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
937: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
938: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
939: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
940: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
941: v += 25; jmin++;
942: }
943: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
944: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
945: for (j=jmin; j<n; j++) {
946: /* (strict lower triangular part of A)*x */
947: cval = ib[j]*5;
948: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
949: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
950: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
951: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
952: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953: /* (strict upper triangular part of A)*x */
954: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
955: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
956: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
957: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
958: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
959: v += 25;
960: }
961: xb +=5; ai++;
962: }
964: VecRestoreArrayRead(xx,&x);
965: VecRestoreArray(zz,&z);
967: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
968: return(0);
969: }
970: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
971: {
972: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
973: PetscScalar *z,x1,x2,x3,x4,x5,x6;
974: const PetscScalar *x,*xb;
975: const MatScalar *v;
976: PetscErrorCode ierr;
977: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
978: const PetscInt *aj=a->j,*ai=a->i,*ib;
979: PetscInt nonzerorow=0;
982: VecCopy(yy,zz);
983: VecGetArrayRead(xx,&x);
984: VecGetArray(zz,&z);
986: v = a->a;
987: xb = x;
989: for (i=0; i<mbs; i++) {
990: n = ai[1] - ai[0]; /* length of i_th block row of A */
991: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
992: ib = aj + *ai;
993: jmin = 0;
994: nonzerorow += (n>0);
995: if (*ib == i) { /* (diag of A)*x */
996: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
997: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
998: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
999: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1000: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1001: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1002: v += 36; jmin++;
1003: }
1004: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1005: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1006: for (j=jmin; j<n; j++) {
1007: /* (strict lower triangular part of A)*x */
1008: cval = ib[j]*6;
1009: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1010: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1011: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1012: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1013: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1014: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015: /* (strict upper triangular part of A)*x */
1016: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1017: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1018: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1019: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1020: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1021: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1022: v += 36;
1023: }
1024: xb +=6; ai++;
1025: }
1027: VecRestoreArrayRead(xx,&x);
1028: VecRestoreArray(zz,&z);
1030: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1031: return(0);
1032: }
1034: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1035: {
1036: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1037: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1038: const PetscScalar *x,*xb;
1039: const MatScalar *v;
1040: PetscErrorCode ierr;
1041: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1042: const PetscInt *aj=a->j,*ai=a->i,*ib;
1043: PetscInt nonzerorow=0;
1046: VecCopy(yy,zz);
1047: VecGetArrayRead(xx,&x);
1048: VecGetArray(zz,&z);
1050: v = a->a;
1051: xb = x;
1053: for (i=0; i<mbs; i++) {
1054: n = ai[1] - ai[0]; /* length of i_th block row of A */
1055: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1056: ib = aj + *ai;
1057: jmin = 0;
1058: nonzerorow += (n>0);
1059: if (*ib == i) { /* (diag of A)*x */
1060: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1061: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1062: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1063: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1064: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1065: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1066: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1067: v += 49; jmin++;
1068: }
1069: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1070: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1071: for (j=jmin; j<n; j++) {
1072: /* (strict lower triangular part of A)*x */
1073: cval = ib[j]*7;
1074: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1075: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1076: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1077: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1078: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1079: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1080: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081: /* (strict upper triangular part of A)*x */
1082: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1083: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1084: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1085: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1086: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1087: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1088: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1089: v += 49;
1090: }
1091: xb +=7; ai++;
1092: }
1094: VecRestoreArrayRead(xx,&x);
1095: VecRestoreArray(zz,&z);
1097: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1098: return(0);
1099: }
1101: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1102: {
1103: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1104: PetscScalar *z,*z_ptr=0,*zb,*work,*workt;
1105: const PetscScalar *x,*x_ptr,*xb;
1106: const MatScalar *v;
1107: PetscErrorCode ierr;
1108: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1109: const PetscInt *idx,*aj,*ii;
1110: PetscInt nonzerorow=0;
1113: VecCopy(yy,zz);
1114: VecGetArrayRead(xx,&x); x_ptr=x;
1115: VecGetArray(zz,&z); z_ptr=z;
1117: aj = a->j;
1118: v = a->a;
1119: ii = a->i;
1121: if (!a->mult_work) {
1122: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1123: }
1124: work = a->mult_work;
1127: for (i=0; i<mbs; i++) {
1128: n = ii[1] - ii[0]; ncols = n*bs;
1129: workt = work; idx=aj+ii[0];
1130: nonzerorow += (n>0);
1132: /* upper triangular part */
1133: for (j=0; j<n; j++) {
1134: xb = x_ptr + bs*(*idx++);
1135: for (k=0; k<bs; k++) workt[k] = xb[k];
1136: workt += bs;
1137: }
1138: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1139: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1141: /* strict lower triangular part */
1142: idx = aj+ii[0];
1143: if (*idx == i) {
1144: ncols -= bs; v += bs2; idx++; n--;
1145: }
1146: if (ncols > 0) {
1147: workt = work;
1148: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1149: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1150: for (j=0; j<n; j++) {
1151: zb = z_ptr + bs*(*idx++);
1152: for (k=0; k<bs; k++) zb[k] += workt[k];
1153: workt += bs;
1154: }
1155: }
1157: x += bs; v += n*bs2; z += bs; ii++;
1158: }
1160: VecRestoreArrayRead(xx,&x);
1161: VecRestoreArray(zz,&z);
1163: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1164: return(0);
1165: }
1167: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1168: {
1169: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1170: PetscScalar oalpha = alpha;
1172: PetscBLASInt one = 1,totalnz;
1175: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1176: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1177: PetscLogFlops(totalnz);
1178: return(0);
1179: }
1181: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1182: {
1183: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1184: const MatScalar *v = a->a;
1185: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1186: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1187: PetscErrorCode ierr;
1188: const PetscInt *aj=a->j,*col;
1191: if (type == NORM_FROBENIUS) {
1192: for (k=0; k<mbs; k++) {
1193: jmin = a->i[k]; jmax = a->i[k+1];
1194: col = aj + jmin;
1195: if (*col == k) { /* diagonal block */
1196: for (i=0; i<bs2; i++) {
1197: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1198: }
1199: jmin++;
1200: }
1201: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1202: for (i=0; i<bs2; i++) {
1203: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1204: }
1205: }
1206: }
1207: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1208: PetscLogFlops(2*bs2*a->nz);
1209: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1210: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1211: for (i=0; i<mbs; i++) jl[i] = mbs;
1212: il[0] = 0;
1214: *norm = 0.0;
1215: for (k=0; k<mbs; k++) { /* k_th block row */
1216: for (j=0; j<bs; j++) sum[j]=0.0;
1217: /*-- col sum --*/
1218: i = jl[k]; /* first |A(i,k)| to be added */
1219: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1220: at step k */
1221: while (i<mbs) {
1222: nexti = jl[i]; /* next block row to be added */
1223: ik = il[i]; /* block index of A(i,k) in the array a */
1224: for (j=0; j<bs; j++) {
1225: v = a->a + ik*bs2 + j*bs;
1226: for (k1=0; k1<bs; k1++) {
1227: sum[j] += PetscAbsScalar(*v); v++;
1228: }
1229: }
1230: /* update il, jl */
1231: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1232: jmax = a->i[i+1];
1233: if (jmin < jmax) {
1234: il[i] = jmin;
1235: j = a->j[jmin];
1236: jl[i] = jl[j]; jl[j]=i;
1237: }
1238: i = nexti;
1239: }
1240: /*-- row sum --*/
1241: jmin = a->i[k]; jmax = a->i[k+1];
1242: for (i=jmin; i<jmax; i++) {
1243: for (j=0; j<bs; j++) {
1244: v = a->a + i*bs2 + j;
1245: for (k1=0; k1<bs; k1++) {
1246: sum[j] += PetscAbsScalar(*v); v += bs;
1247: }
1248: }
1249: }
1250: /* add k_th block row to il, jl */
1251: col = aj+jmin;
1252: if (*col == k) jmin++;
1253: if (jmin < jmax) {
1254: il[k] = jmin;
1255: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1256: }
1257: for (j=0; j<bs; j++) {
1258: if (sum[j] > *norm) *norm = sum[j];
1259: }
1260: }
1261: PetscFree3(sum,il,jl);
1262: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1263: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1264: return(0);
1265: }
1267: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1268: {
1269: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1273: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1274: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1275: *flg = PETSC_FALSE;
1276: return(0);
1277: }
1279: /* if the a->i are the same */
1280: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1281: if (!*flg) return(0);
1283: /* if a->j are the same */
1284: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1285: if (!*flg) return(0);
1287: /* if a->a are the same */
1288: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1289: return(0);
1290: }
1292: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1293: {
1294: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1295: PetscErrorCode ierr;
1296: PetscInt i,j,k,row,bs,ambs,bs2;
1297: const PetscInt *ai,*aj;
1298: PetscScalar *x,zero = 0.0;
1299: const MatScalar *aa,*aa_j;
1302: bs = A->rmap->bs;
1303: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1305: aa = a->a;
1306: ambs = a->mbs;
1308: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1309: PetscInt *diag=a->diag;
1310: aa = a->a;
1311: ambs = a->mbs;
1312: VecGetArray(v,&x);
1313: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1314: VecRestoreArray(v,&x);
1315: return(0);
1316: }
1318: ai = a->i;
1319: aj = a->j;
1320: bs2 = a->bs2;
1321: VecSet(v,zero);
1322: VecGetArray(v,&x);
1323: for (i=0; i<ambs; i++) {
1324: j=ai[i];
1325: if (aj[j] == i) { /* if this is a diagonal element */
1326: row = i*bs;
1327: aa_j = aa + j*bs2;
1328: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1329: }
1330: }
1331: VecRestoreArray(v,&x);
1332: return(0);
1333: }
1335: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1336: {
1337: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1338: PetscScalar x;
1339: const PetscScalar *l,*li,*ri;
1340: MatScalar *aa,*v;
1341: PetscErrorCode ierr;
1342: PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1343: const PetscInt *ai,*aj;
1344: PetscBool flg;
1347: if (ll != rr) {
1348: VecEqual(ll,rr,&flg);
1349: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1350: }
1351: if (!ll) return(0);
1352: ai = a->i;
1353: aj = a->j;
1354: aa = a->a;
1355: m = A->rmap->N;
1356: bs = A->rmap->bs;
1357: mbs = a->mbs;
1358: bs2 = a->bs2;
1360: VecGetArrayRead(ll,&l);
1361: VecGetLocalSize(ll,&lm);
1362: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1363: for (i=0; i<mbs; i++) { /* for each block row */
1364: M = ai[i+1] - ai[i];
1365: li = l + i*bs;
1366: v = aa + bs2*ai[i];
1367: for (j=0; j<M; j++) { /* for each block */
1368: ri = l + bs*aj[ai[i]+j];
1369: for (k=0; k<bs; k++) {
1370: x = ri[k];
1371: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1372: }
1373: }
1374: }
1375: VecRestoreArrayRead(ll,&l);
1376: PetscLogFlops(2.0*a->nz);
1377: return(0);
1378: }
1380: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1381: {
1382: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1385: info->block_size = a->bs2;
1386: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1387: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1388: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1389: info->assemblies = A->num_ass;
1390: info->mallocs = A->info.mallocs;
1391: info->memory = ((PetscObject)A)->mem;
1392: if (A->factortype) {
1393: info->fill_ratio_given = A->info.fill_ratio_given;
1394: info->fill_ratio_needed = A->info.fill_ratio_needed;
1395: info->factor_mallocs = A->info.factor_mallocs;
1396: } else {
1397: info->fill_ratio_given = 0;
1398: info->fill_ratio_needed = 0;
1399: info->factor_mallocs = 0;
1400: }
1401: return(0);
1402: }
1405: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1406: {
1407: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1411: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1412: return(0);
1413: }
1415: /*
1416: This code does not work since it only checks the upper triangular part of
1417: the matrix. Hence it is not listed in the function table.
1418: */
1419: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1420: {
1421: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1422: PetscErrorCode ierr;
1423: PetscInt i,j,n,row,col,bs,mbs;
1424: const PetscInt *ai,*aj;
1425: PetscReal atmp;
1426: const MatScalar *aa;
1427: PetscScalar *x;
1428: PetscInt ncols,brow,bcol,krow,kcol;
1431: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1432: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1433: bs = A->rmap->bs;
1434: aa = a->a;
1435: ai = a->i;
1436: aj = a->j;
1437: mbs = a->mbs;
1439: VecSet(v,0.0);
1440: VecGetArray(v,&x);
1441: VecGetLocalSize(v,&n);
1442: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1443: for (i=0; i<mbs; i++) {
1444: ncols = ai[1] - ai[0]; ai++;
1445: brow = bs*i;
1446: for (j=0; j<ncols; j++) {
1447: bcol = bs*(*aj);
1448: for (kcol=0; kcol<bs; kcol++) {
1449: col = bcol + kcol; /* col index */
1450: for (krow=0; krow<bs; krow++) {
1451: atmp = PetscAbsScalar(*aa); aa++;
1452: row = brow + krow; /* row index */
1453: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1454: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1455: }
1456: }
1457: aj++;
1458: }
1459: }
1460: VecRestoreArray(v,&x);
1461: return(0);
1462: }