Actual source code: sbaij2.c
petsc-3.14.6 2021-03-30
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
13: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14: const PetscInt *idx;
15: PetscBT table_out,table_in;
18: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19: mbs = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
23: PetscBTCreate(mbs,&table_out);
24: PetscMalloc1(mbs+1,&nidx);
25: PetscMalloc1(A->rmap->N+1,&nidx2);
26: PetscBTCreate(mbs,&table_in);
28: for (i=0; i<is_max; i++) { /* for each is */
29: isz = 0;
30: PetscBTMemzero(mbs,table_out);
32: /* Extract the indices, assume there can be duplicate entries */
33: ISGetIndices(is[i],&idx);
34: ISGetLocalSize(is[i],&n);
36: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37: bcol_max = 0;
38: for (j=0; j<n; ++j) {
39: brow = idx[j]/bs; /* convert the indices into block indices */
40: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41: if (!PetscBTLookupSet(table_out,brow)) {
42: nidx[isz++] = brow;
43: if (bcol_max < brow) bcol_max = brow;
44: }
45: }
46: ISRestoreIndices(is[i],&idx);
47: ISDestroy(&is[i]);
49: k = 0;
50: for (j=0; j<ov; j++) { /* for each overlap */
51: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52: PetscBTMemzero(mbs,table_in);
53: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
55: n = isz; /* length of the updated is[i] */
56: for (brow=0; brow<mbs; brow++) {
57: start = ai[brow]; end = ai[brow+1];
58: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59: for (l = start; l<end; l++) {
60: bcol = aj[l];
61: if (!PetscBTLookupSet(table_out,bcol)) {
62: nidx[isz++] = bcol;
63: if (bcol_max < bcol) bcol_max = bcol;
64: }
65: }
66: k++;
67: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69: for (l = start; l<end; l++) {
70: bcol = aj[l];
71: if (bcol > bcol_max) break;
72: if (PetscBTLookup(table_in,bcol)) {
73: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74: break; /* for l = start; l<end ; l++) */
75: }
76: }
77: }
78: }
79: } /* for each overlap */
81: /* expand the Index Set */
82: for (j=0; j<isz; j++) {
83: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84: }
85: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
86: }
87: PetscBTDestroy(&table_out);
88: PetscFree(nidx);
89: PetscFree(nidx2);
90: PetscBTDestroy(&table_in);
91: return(0);
92: }
94: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95: Zero some ops' to avoid invalid usse */
96: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97: {
101: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
102: Bseq->ops->mult = NULL;
103: Bseq->ops->multadd = NULL;
104: Bseq->ops->multtranspose = NULL;
105: Bseq->ops->multtransposeadd = NULL;
106: Bseq->ops->lufactor = NULL;
107: Bseq->ops->choleskyfactor = NULL;
108: Bseq->ops->lufactorsymbolic = NULL;
109: Bseq->ops->choleskyfactorsymbolic = NULL;
110: Bseq->ops->getinertia = NULL;
111: return(0);
112: }
114: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
115: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116: {
117: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
119: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
121: const PetscInt *irow,*icol;
122: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123: PetscInt *aj = a->j,*ai = a->i;
124: MatScalar *mat_a;
125: Mat C;
126: PetscBool flag;
130: ISGetIndices(isrow,&irow);
131: ISGetIndices(iscol,&icol);
132: ISGetLocalSize(isrow,&nrows);
133: ISGetLocalSize(iscol,&ncols);
135: PetscCalloc1(1+oldcols,&smap);
136: ssmap = smap;
137: PetscMalloc1(1+nrows,&lens);
138: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139: /* determine lens of each row */
140: for (i=0; i<nrows; i++) {
141: kstart = ai[irow[i]];
142: kend = kstart + a->ilen[irow[i]];
143: lens[i] = 0;
144: for (k=kstart; k<kend; k++) {
145: if (ssmap[aj[k]]) lens[i]++;
146: }
147: }
148: /* Create and fill new matrix */
149: if (scall == MAT_REUSE_MATRIX) {
150: c = (Mat_SeqSBAIJ*)((*B)->data);
152: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
154: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155: PetscArrayzero(c->ilen,c->mbs);
156: C = *B;
157: } else {
158: MatCreate(PetscObjectComm((PetscObject)A),&C);
159: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
160: MatSetType(C,((PetscObject)A)->type_name);
161: MatSeqSBAIJSetPreallocation(C,bs,0,lens);
162: }
163: c = (Mat_SeqSBAIJ*)(C->data);
164: for (i=0; i<nrows; i++) {
165: row = irow[i];
166: kstart = ai[row];
167: kend = kstart + a->ilen[row];
168: mat_i = c->i[i];
169: mat_j = c->j + mat_i;
170: mat_a = c->a + mat_i*bs2;
171: mat_ilen = c->ilen + i;
172: for (k=kstart; k<kend; k++) {
173: if ((tcol=ssmap[a->j[k]])) {
174: *mat_j++ = tcol - 1;
175: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
176: mat_a += bs2;
177: (*mat_ilen)++;
178: }
179: }
180: }
181: /* sort */
182: {
183: MatScalar *work;
185: PetscMalloc1(bs2,&work);
186: for (i=0; i<nrows; i++) {
187: PetscInt ilen;
188: mat_i = c->i[i];
189: mat_j = c->j + mat_i;
190: mat_a = c->a + mat_i*bs2;
191: ilen = c->ilen[i];
192: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
193: }
194: PetscFree(work);
195: }
197: /* Free work space */
198: ISRestoreIndices(iscol,&icol);
199: PetscFree(smap);
200: PetscFree(lens);
201: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
202: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
204: ISRestoreIndices(isrow,&irow);
205: *B = C;
206: return(0);
207: }
209: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210: {
211: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
212: IS is1,is2;
214: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215: const PetscInt *irow,*icol;
218: ISGetIndices(isrow,&irow);
219: ISGetIndices(iscol,&icol);
220: ISGetLocalSize(isrow,&nrows);
221: ISGetLocalSize(iscol,&ncols);
223: /* Verify if the indices corespond to each element in a block
224: and form the IS with compressed IS */
225: maxmnbs = PetscMax(a->mbs,a->nbs);
226: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
227: PetscArrayzero(vary,a->mbs);
228: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229: for (i=0; i<a->mbs; i++) {
230: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231: }
232: count = 0;
233: for (i=0; i<nrows; i++) {
234: PetscInt j = irow[i] / bs;
235: if ((vary[j]--)==bs) iary[count++] = j;
236: }
237: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
239: PetscArrayzero(vary,a->nbs);
240: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241: for (i=0; i<a->nbs; i++) {
242: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243: }
244: count = 0;
245: for (i=0; i<ncols; i++) {
246: PetscInt j = icol[i] / bs;
247: if ((vary[j]--)==bs) iary[count++] = j;
248: }
249: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
250: ISRestoreIndices(isrow,&irow);
251: ISRestoreIndices(iscol,&icol);
252: PetscFree2(vary,iary);
254: MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
255: ISDestroy(&is1);
256: ISDestroy(&is2);
258: if (isrow != iscol) {
259: PetscBool isequal;
260: ISEqual(isrow,iscol,&isequal);
261: if (!isequal) {
262: MatSeqSBAIJZeroOps_Private(*B);
263: }
264: }
265: return(0);
266: }
268: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269: {
271: PetscInt i;
274: if (scall == MAT_INITIAL_MATRIX) {
275: PetscCalloc1(n+1,B);
276: }
278: for (i=0; i<n; i++) {
279: MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
280: }
281: return(0);
282: }
284: /* -------------------------------------------------------*/
285: /* Should check that shapes of vectors and matrices match */
286: /* -------------------------------------------------------*/
288: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289: {
290: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
291: PetscScalar *z,x1,x2,zero=0.0;
292: const PetscScalar *x,*xb;
293: const MatScalar *v;
294: PetscErrorCode ierr;
295: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
296: const PetscInt *aj=a->j,*ai=a->i,*ib;
297: PetscInt nonzerorow=0;
300: VecSet(zz,zero);
301: if (!a->nz) return(0);
302: VecGetArrayRead(xx,&x);
303: VecGetArray(zz,&z);
305: v = a->a;
306: xb = x;
308: for (i=0; i<mbs; i++) {
309: n = ai[1] - ai[0]; /* length of i_th block row of A */
310: x1 = xb[0]; x2 = xb[1];
311: ib = aj + *ai;
312: jmin = 0;
313: nonzerorow += (n>0);
314: if (*ib == i) { /* (diag of A)*x */
315: z[2*i] += v[0]*x1 + v[2]*x2;
316: z[2*i+1] += v[2]*x1 + v[3]*x2;
317: v += 4; jmin++;
318: }
319: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321: for (j=jmin; j<n; j++) {
322: /* (strict lower triangular part of A)*x */
323: cval = ib[j]*2;
324: z[cval] += v[0]*x1 + v[1]*x2;
325: z[cval+1] += v[2]*x1 + v[3]*x2;
326: /* (strict upper triangular part of A)*x */
327: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
328: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329: v += 4;
330: }
331: xb +=2; ai++;
332: }
334: VecRestoreArrayRead(xx,&x);
335: VecRestoreArray(zz,&z);
336: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
337: return(0);
338: }
340: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341: {
342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
343: PetscScalar *z,x1,x2,x3,zero=0.0;
344: const PetscScalar *x,*xb;
345: const MatScalar *v;
346: PetscErrorCode ierr;
347: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
348: const PetscInt *aj = a->j,*ai = a->i,*ib;
349: PetscInt nonzerorow=0;
352: VecSet(zz,zero);
353: if (!a->nz) return(0);
354: VecGetArrayRead(xx,&x);
355: VecGetArray(zz,&z);
357: v = a->a;
358: xb = x;
360: for (i=0; i<mbs; i++) {
361: n = ai[1] - ai[0]; /* length of i_th block row of A */
362: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
363: ib = aj + *ai;
364: jmin = 0;
365: nonzerorow += (n>0);
366: if (*ib == i) { /* (diag of A)*x */
367: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
368: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370: v += 9; jmin++;
371: }
372: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
374: for (j=jmin; j<n; j++) {
375: /* (strict lower triangular part of A)*x */
376: cval = ib[j]*3;
377: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
378: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380: /* (strict upper triangular part of A)*x */
381: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384: v += 9;
385: }
386: xb +=3; ai++;
387: }
389: VecRestoreArrayRead(xx,&x);
390: VecRestoreArray(zz,&z);
391: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
392: return(0);
393: }
395: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396: {
397: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
398: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
399: const PetscScalar *x,*xb;
400: const MatScalar *v;
401: PetscErrorCode ierr;
402: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
403: const PetscInt *aj = a->j,*ai = a->i,*ib;
404: PetscInt nonzerorow = 0;
407: VecSet(zz,zero);
408: if (!a->nz) return(0);
409: VecGetArrayRead(xx,&x);
410: VecGetArray(zz,&z);
412: v = a->a;
413: xb = x;
415: for (i=0; i<mbs; i++) {
416: n = ai[1] - ai[0]; /* length of i_th block row of A */
417: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418: ib = aj + *ai;
419: jmin = 0;
420: nonzerorow += (n>0);
421: if (*ib == i) { /* (diag of A)*x */
422: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
423: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
424: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426: v += 16; jmin++;
427: }
428: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430: for (j=jmin; j<n; j++) {
431: /* (strict lower triangular part of A)*x */
432: cval = ib[j]*4;
433: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437: /* (strict upper triangular part of A)*x */
438: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442: v += 16;
443: }
444: xb +=4; ai++;
445: }
447: VecRestoreArrayRead(xx,&x);
448: VecRestoreArray(zz,&z);
449: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
450: return(0);
451: }
453: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454: {
455: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
456: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
457: const PetscScalar *x,*xb;
458: const MatScalar *v;
459: PetscErrorCode ierr;
460: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
461: const PetscInt *aj = a->j,*ai = a->i,*ib;
462: PetscInt nonzerorow=0;
465: VecSet(zz,zero);
466: if (!a->nz) return(0);
467: VecGetArrayRead(xx,&x);
468: VecGetArray(zz,&z);
470: v = a->a;
471: xb = x;
473: for (i=0; i<mbs; i++) {
474: n = ai[1] - ai[0]; /* length of i_th block row of A */
475: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476: ib = aj + *ai;
477: jmin = 0;
478: nonzerorow += (n>0);
479: if (*ib == i) { /* (diag of A)*x */
480: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485: v += 25; jmin++;
486: }
487: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489: for (j=jmin; j<n; j++) {
490: /* (strict lower triangular part of A)*x */
491: cval = ib[j]*5;
492: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497: /* (strict upper triangular part of A)*x */
498: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503: v += 25;
504: }
505: xb +=5; ai++;
506: }
508: VecRestoreArrayRead(xx,&x);
509: VecRestoreArray(zz,&z);
510: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
511: return(0);
512: }
514: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
517: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518: const PetscScalar *x,*xb;
519: const MatScalar *v;
520: PetscErrorCode ierr;
521: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
522: const PetscInt *aj=a->j,*ai=a->i,*ib;
523: PetscInt nonzerorow=0;
526: VecSet(zz,zero);
527: if (!a->nz) return(0);
528: VecGetArrayRead(xx,&x);
529: VecGetArray(zz,&z);
531: v = a->a;
532: xb = x;
534: for (i=0; i<mbs; i++) {
535: n = ai[1] - ai[0]; /* length of i_th block row of A */
536: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537: ib = aj + *ai;
538: jmin = 0;
539: nonzerorow += (n>0);
540: if (*ib == i) { /* (diag of A)*x */
541: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547: v += 36; jmin++;
548: }
549: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551: for (j=jmin; j<n; j++) {
552: /* (strict lower triangular part of A)*x */
553: cval = ib[j]*6;
554: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560: /* (strict upper triangular part of A)*x */
561: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567: v += 36;
568: }
569: xb +=6; ai++;
570: }
572: VecRestoreArrayRead(xx,&x);
573: VecRestoreArray(zz,&z);
574: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
575: return(0);
576: }
578: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579: {
580: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
581: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582: const PetscScalar *x,*xb;
583: const MatScalar *v;
584: PetscErrorCode ierr;
585: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
586: const PetscInt *aj=a->j,*ai=a->i,*ib;
587: PetscInt nonzerorow=0;
590: VecSet(zz,zero);
591: if (!a->nz) return(0);
592: VecGetArrayRead(xx,&x);
593: VecGetArray(zz,&z);
595: v = a->a;
596: xb = x;
598: for (i=0; i<mbs; i++) {
599: n = ai[1] - ai[0]; /* length of i_th block row of A */
600: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601: ib = aj + *ai;
602: jmin = 0;
603: nonzerorow += (n>0);
604: if (*ib == i) { /* (diag of A)*x */
605: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612: v += 49; jmin++;
613: }
614: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616: for (j=jmin; j<n; j++) {
617: /* (strict lower triangular part of A)*x */
618: cval = ib[j]*7;
619: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626: /* (strict upper triangular part of A)*x */
627: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634: v += 49;
635: }
636: xb +=7; ai++;
637: }
638: VecRestoreArrayRead(xx,&x);
639: VecRestoreArray(zz,&z);
640: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
641: return(0);
642: }
644: /*
645: This will not work with MatScalar == float because it calls the BLAS
646: */
647: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648: {
649: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
650: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651: const PetscScalar *x,*x_ptr,*xb;
652: const MatScalar *v;
653: PetscErrorCode ierr;
654: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655: const PetscInt *idx,*aj,*ii;
656: PetscInt nonzerorow=0;
659: VecSet(zz,zero);
660: if (!a->nz) return(0);
661: VecGetArrayRead(xx,&x);
662: VecGetArray(zz,&z);
664: x_ptr = x;
665: z_ptr = z;
667: aj = a->j;
668: v = a->a;
669: ii = a->i;
671: if (!a->mult_work) {
672: PetscMalloc1(A->rmap->N+1,&a->mult_work);
673: }
674: work = a->mult_work;
676: for (i=0; i<mbs; i++) {
677: n = ii[1] - ii[0]; ncols = n*bs;
678: workt = work; idx=aj+ii[0];
679: nonzerorow += (n>0);
681: /* upper triangular part */
682: for (j=0; j<n; j++) {
683: xb = x_ptr + bs*(*idx++);
684: for (k=0; k<bs; k++) workt[k] = xb[k];
685: workt += bs;
686: }
687: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
688: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
690: /* strict lower triangular part */
691: idx = aj+ii[0];
692: if (n && *idx == i) {
693: ncols -= bs; v += bs2; idx++; n--;
694: }
696: if (ncols > 0) {
697: workt = work;
698: PetscArrayzero(workt,ncols);
699: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
700: for (j=0; j<n; j++) {
701: zb = z_ptr + bs*(*idx++);
702: for (k=0; k<bs; k++) zb[k] += workt[k];
703: workt += bs;
704: }
705: }
706: x += bs; v += n*bs2; z += bs; ii++;
707: }
709: VecRestoreArrayRead(xx,&x);
710: VecRestoreArray(zz,&z);
711: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
712: return(0);
713: }
715: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
716: {
717: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
718: PetscScalar *z,x1;
719: const PetscScalar *x,*xb;
720: const MatScalar *v;
721: PetscErrorCode ierr;
722: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
723: const PetscInt *aj=a->j,*ai=a->i,*ib;
724: PetscInt nonzerorow=0;
725: #if defined(PETSC_USE_COMPLEX)
726: const int aconj = A->hermitian;
727: #else
728: const int aconj = 0;
729: #endif
732: VecCopy(yy,zz);
733: if (!a->nz) return(0);
734: VecGetArrayRead(xx,&x);
735: VecGetArray(zz,&z);
736: v = a->a;
737: xb = x;
739: for (i=0; i<mbs; i++) {
740: n = ai[1] - ai[0]; /* length of i_th row of A */
741: x1 = xb[0];
742: ib = aj + *ai;
743: jmin = 0;
744: nonzerorow += (n>0);
745: if (n && *ib == i) { /* (diag of A)*x */
746: z[i] += *v++ * x[*ib++]; jmin++;
747: }
748: if (aconj) {
749: for (j=jmin; j<n; j++) {
750: cval = *ib;
751: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
752: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
753: }
754: } else {
755: for (j=jmin; j<n; j++) {
756: cval = *ib;
757: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
758: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
759: }
760: }
761: xb++; ai++;
762: }
764: VecRestoreArrayRead(xx,&x);
765: VecRestoreArray(zz,&z);
767: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
768: return(0);
769: }
771: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
772: {
773: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
774: PetscScalar *z,x1,x2;
775: const PetscScalar *x,*xb;
776: const MatScalar *v;
777: PetscErrorCode ierr;
778: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
779: const PetscInt *aj=a->j,*ai=a->i,*ib;
780: PetscInt nonzerorow=0;
783: VecCopy(yy,zz);
784: if (!a->nz) return(0);
785: VecGetArrayRead(xx,&x);
786: VecGetArray(zz,&z);
788: v = a->a;
789: xb = x;
791: for (i=0; i<mbs; i++) {
792: n = ai[1] - ai[0]; /* length of i_th block row of A */
793: x1 = xb[0]; x2 = xb[1];
794: ib = aj + *ai;
795: jmin = 0;
796: nonzerorow += (n>0);
797: if (n && *ib == i) { /* (diag of A)*x */
798: z[2*i] += v[0]*x1 + v[2]*x2;
799: z[2*i+1] += v[2]*x1 + v[3]*x2;
800: v += 4; jmin++;
801: }
802: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
803: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
804: for (j=jmin; j<n; j++) {
805: /* (strict lower triangular part of A)*x */
806: cval = ib[j]*2;
807: z[cval] += v[0]*x1 + v[1]*x2;
808: z[cval+1] += v[2]*x1 + v[3]*x2;
809: /* (strict upper triangular part of A)*x */
810: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
811: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
812: v += 4;
813: }
814: xb +=2; ai++;
815: }
816: VecRestoreArrayRead(xx,&x);
817: VecRestoreArray(zz,&z);
819: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
820: return(0);
821: }
823: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
824: {
825: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
826: PetscScalar *z,x1,x2,x3;
827: const PetscScalar *x,*xb;
828: const MatScalar *v;
829: PetscErrorCode ierr;
830: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
831: const PetscInt *aj=a->j,*ai=a->i,*ib;
832: PetscInt nonzerorow=0;
835: VecCopy(yy,zz);
836: if (!a->nz) return(0);
837: VecGetArrayRead(xx,&x);
838: VecGetArray(zz,&z);
840: v = a->a;
841: xb = x;
843: for (i=0; i<mbs; i++) {
844: n = ai[1] - ai[0]; /* length of i_th block row of A */
845: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
846: ib = aj + *ai;
847: jmin = 0;
848: nonzerorow += (n>0);
849: if (n && *ib == i) { /* (diag of A)*x */
850: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
851: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
852: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
853: v += 9; jmin++;
854: }
855: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857: for (j=jmin; j<n; j++) {
858: /* (strict lower triangular part of A)*x */
859: cval = ib[j]*3;
860: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
861: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
862: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
863: /* (strict upper triangular part of A)*x */
864: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
865: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
866: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
867: v += 9;
868: }
869: xb +=3; ai++;
870: }
872: VecRestoreArrayRead(xx,&x);
873: VecRestoreArray(zz,&z);
875: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
876: return(0);
877: }
879: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
880: {
881: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
882: PetscScalar *z,x1,x2,x3,x4;
883: const PetscScalar *x,*xb;
884: const MatScalar *v;
885: PetscErrorCode ierr;
886: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
887: const PetscInt *aj=a->j,*ai=a->i,*ib;
888: PetscInt nonzerorow=0;
891: VecCopy(yy,zz);
892: if (!a->nz) return(0);
893: VecGetArrayRead(xx,&x);
894: VecGetArray(zz,&z);
896: v = a->a;
897: xb = x;
899: for (i=0; i<mbs; i++) {
900: n = ai[1] - ai[0]; /* length of i_th block row of A */
901: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
902: ib = aj + *ai;
903: jmin = 0;
904: nonzerorow += (n>0);
905: if (n && *ib == i) { /* (diag of A)*x */
906: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
907: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
908: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
909: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
910: v += 16; jmin++;
911: }
912: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
913: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
914: for (j=jmin; j<n; j++) {
915: /* (strict lower triangular part of A)*x */
916: cval = ib[j]*4;
917: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
918: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
919: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
920: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
921: /* (strict upper triangular part of A)*x */
922: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
923: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
924: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
925: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
926: v += 16;
927: }
928: xb +=4; ai++;
929: }
931: VecRestoreArrayRead(xx,&x);
932: VecRestoreArray(zz,&z);
934: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
935: return(0);
936: }
938: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
939: {
940: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
941: PetscScalar *z,x1,x2,x3,x4,x5;
942: const PetscScalar *x,*xb;
943: const MatScalar *v;
944: PetscErrorCode ierr;
945: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
946: const PetscInt *aj=a->j,*ai=a->i,*ib;
947: PetscInt nonzerorow=0;
950: VecCopy(yy,zz);
951: if (!a->nz) return(0);
952: VecGetArrayRead(xx,&x);
953: VecGetArray(zz,&z);
955: v = a->a;
956: xb = x;
958: for (i=0; i<mbs; i++) {
959: n = ai[1] - ai[0]; /* length of i_th block row of A */
960: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
961: ib = aj + *ai;
962: jmin = 0;
963: nonzerorow += (n>0);
964: if (n && *ib == i) { /* (diag of A)*x */
965: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
966: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
967: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
968: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
969: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
970: v += 25; jmin++;
971: }
972: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
973: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
974: for (j=jmin; j<n; j++) {
975: /* (strict lower triangular part of A)*x */
976: cval = ib[j]*5;
977: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
978: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
979: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
980: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
981: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
982: /* (strict upper triangular part of A)*x */
983: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
984: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
985: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
986: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
987: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
988: v += 25;
989: }
990: xb +=5; ai++;
991: }
993: VecRestoreArrayRead(xx,&x);
994: VecRestoreArray(zz,&z);
996: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
997: return(0);
998: }
1000: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1001: {
1002: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1003: PetscScalar *z,x1,x2,x3,x4,x5,x6;
1004: const PetscScalar *x,*xb;
1005: const MatScalar *v;
1006: PetscErrorCode ierr;
1007: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1008: const PetscInt *aj=a->j,*ai=a->i,*ib;
1009: PetscInt nonzerorow=0;
1012: VecCopy(yy,zz);
1013: if (!a->nz) return(0);
1014: VecGetArrayRead(xx,&x);
1015: VecGetArray(zz,&z);
1017: v = a->a;
1018: xb = x;
1020: for (i=0; i<mbs; i++) {
1021: n = ai[1] - ai[0]; /* length of i_th block row of A */
1022: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1023: ib = aj + *ai;
1024: jmin = 0;
1025: nonzerorow += (n>0);
1026: if (n && *ib == i) { /* (diag of A)*x */
1027: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1028: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1029: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1030: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1031: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1032: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1033: v += 36; jmin++;
1034: }
1035: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1036: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1037: for (j=jmin; j<n; j++) {
1038: /* (strict lower triangular part of A)*x */
1039: cval = ib[j]*6;
1040: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1041: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1042: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1043: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1044: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1045: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1046: /* (strict upper triangular part of A)*x */
1047: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1048: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1049: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1050: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1051: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1052: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1053: v += 36;
1054: }
1055: xb +=6; ai++;
1056: }
1058: VecRestoreArrayRead(xx,&x);
1059: VecRestoreArray(zz,&z);
1061: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1062: return(0);
1063: }
1065: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1066: {
1067: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1068: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1069: const PetscScalar *x,*xb;
1070: const MatScalar *v;
1071: PetscErrorCode ierr;
1072: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1073: const PetscInt *aj=a->j,*ai=a->i,*ib;
1074: PetscInt nonzerorow=0;
1077: VecCopy(yy,zz);
1078: if (!a->nz) return(0);
1079: VecGetArrayRead(xx,&x);
1080: VecGetArray(zz,&z);
1082: v = a->a;
1083: xb = x;
1085: for (i=0; i<mbs; i++) {
1086: n = ai[1] - ai[0]; /* length of i_th block row of A */
1087: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1088: ib = aj + *ai;
1089: jmin = 0;
1090: nonzerorow += (n>0);
1091: if (n && *ib == i) { /* (diag of A)*x */
1092: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1093: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1094: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1095: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1096: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1097: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1098: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1099: v += 49; jmin++;
1100: }
1101: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103: for (j=jmin; j<n; j++) {
1104: /* (strict lower triangular part of A)*x */
1105: cval = ib[j]*7;
1106: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1107: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1108: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1109: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1110: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1111: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1112: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1113: /* (strict upper triangular part of A)*x */
1114: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1115: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1116: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1117: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1118: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1119: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1120: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1121: v += 49;
1122: }
1123: xb +=7; ai++;
1124: }
1126: VecRestoreArrayRead(xx,&x);
1127: VecRestoreArray(zz,&z);
1129: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1130: return(0);
1131: }
1133: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1134: {
1135: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1136: PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt;
1137: const PetscScalar *x,*x_ptr,*xb;
1138: const MatScalar *v;
1139: PetscErrorCode ierr;
1140: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1141: const PetscInt *idx,*aj,*ii;
1142: PetscInt nonzerorow=0;
1145: VecCopy(yy,zz);
1146: if (!a->nz) return(0);
1147: VecGetArrayRead(xx,&x); x_ptr=x;
1148: VecGetArray(zz,&z); z_ptr=z;
1150: aj = a->j;
1151: v = a->a;
1152: ii = a->i;
1154: if (!a->mult_work) {
1155: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1156: }
1157: work = a->mult_work;
1160: for (i=0; i<mbs; i++) {
1161: n = ii[1] - ii[0]; ncols = n*bs;
1162: workt = work; idx=aj+ii[0];
1163: nonzerorow += (n>0);
1165: /* upper triangular part */
1166: for (j=0; j<n; j++) {
1167: xb = x_ptr + bs*(*idx++);
1168: for (k=0; k<bs; k++) workt[k] = xb[k];
1169: workt += bs;
1170: }
1171: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1172: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1174: /* strict lower triangular part */
1175: idx = aj+ii[0];
1176: if (n && *idx == i) {
1177: ncols -= bs; v += bs2; idx++; n--;
1178: }
1179: if (ncols > 0) {
1180: workt = work;
1181: PetscArrayzero(workt,ncols);
1182: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1183: for (j=0; j<n; j++) {
1184: zb = z_ptr + bs*(*idx++);
1185: for (k=0; k<bs; k++) zb[k] += workt[k];
1186: workt += bs;
1187: }
1188: }
1190: x += bs; v += n*bs2; z += bs; ii++;
1191: }
1193: VecRestoreArrayRead(xx,&x);
1194: VecRestoreArray(zz,&z);
1196: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1197: return(0);
1198: }
1200: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1201: {
1202: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1203: PetscScalar oalpha = alpha;
1205: PetscBLASInt one = 1,totalnz;
1208: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1209: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1210: PetscLogFlops(totalnz);
1211: return(0);
1212: }
1214: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1215: {
1216: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1217: const MatScalar *v = a->a;
1218: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1219: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1220: PetscErrorCode ierr;
1221: const PetscInt *aj=a->j,*col;
1224: if (!a->nz) {
1225: *norm = 0.0;
1226: return(0);
1227: }
1228: if (type == NORM_FROBENIUS) {
1229: for (k=0; k<mbs; k++) {
1230: jmin = a->i[k];
1231: jmax = a->i[k+1];
1232: col = aj + jmin;
1233: if (jmax-jmin > 0 && *col == k) { /* diagonal block */
1234: for (i=0; i<bs2; i++) {
1235: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1236: }
1237: jmin++;
1238: }
1239: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1240: for (i=0; i<bs2; i++) {
1241: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242: }
1243: }
1244: }
1245: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1246: PetscLogFlops(2.0*bs2*a->nz);
1247: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1248: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1249: for (i=0; i<mbs; i++) jl[i] = mbs;
1250: il[0] = 0;
1252: *norm = 0.0;
1253: for (k=0; k<mbs; k++) { /* k_th block row */
1254: for (j=0; j<bs; j++) sum[j]=0.0;
1255: /*-- col sum --*/
1256: i = jl[k]; /* first |A(i,k)| to be added */
1257: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1258: at step k */
1259: while (i<mbs) {
1260: nexti = jl[i]; /* next block row to be added */
1261: ik = il[i]; /* block index of A(i,k) in the array a */
1262: for (j=0; j<bs; j++) {
1263: v = a->a + ik*bs2 + j*bs;
1264: for (k1=0; k1<bs; k1++) {
1265: sum[j] += PetscAbsScalar(*v); v++;
1266: }
1267: }
1268: /* update il, jl */
1269: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1270: jmax = a->i[i+1];
1271: if (jmin < jmax) {
1272: il[i] = jmin;
1273: j = a->j[jmin];
1274: jl[i] = jl[j]; jl[j]=i;
1275: }
1276: i = nexti;
1277: }
1278: /*-- row sum --*/
1279: jmin = a->i[k];
1280: jmax = a->i[k+1];
1281: for (i=jmin; i<jmax; i++) {
1282: for (j=0; j<bs; j++) {
1283: v = a->a + i*bs2 + j;
1284: for (k1=0; k1<bs; k1++) {
1285: sum[j] += PetscAbsScalar(*v); v += bs;
1286: }
1287: }
1288: }
1289: /* add k_th block row to il, jl */
1290: col = aj+jmin;
1291: if (jmax - jmin > 0 && *col == k) jmin++;
1292: if (jmin < jmax) {
1293: il[k] = jmin;
1294: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1295: }
1296: for (j=0; j<bs; j++) {
1297: if (sum[j] > *norm) *norm = sum[j];
1298: }
1299: }
1300: PetscFree3(sum,il,jl);
1301: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1302: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1303: return(0);
1304: }
1306: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1307: {
1308: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1312: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1313: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1314: *flg = PETSC_FALSE;
1315: return(0);
1316: }
1318: /* if the a->i are the same */
1319: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1320: if (!*flg) return(0);
1322: /* if a->j are the same */
1323: PetscArraycmp(a->j,b->j,a->nz,flg);
1324: if (!*flg) return(0);
1326: /* if a->a are the same */
1327: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1328: return(0);
1329: }
1331: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1332: {
1333: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1334: PetscErrorCode ierr;
1335: PetscInt i,j,k,row,bs,ambs,bs2;
1336: const PetscInt *ai,*aj;
1337: PetscScalar *x,zero = 0.0;
1338: const MatScalar *aa,*aa_j;
1341: bs = A->rmap->bs;
1342: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1344: aa = a->a;
1345: ambs = a->mbs;
1347: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1348: PetscInt *diag=a->diag;
1349: aa = a->a;
1350: ambs = a->mbs;
1351: VecGetArray(v,&x);
1352: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1353: VecRestoreArray(v,&x);
1354: return(0);
1355: }
1357: ai = a->i;
1358: aj = a->j;
1359: bs2 = a->bs2;
1360: VecSet(v,zero);
1361: if (!a->nz) return(0);
1362: VecGetArray(v,&x);
1363: for (i=0; i<ambs; i++) {
1364: j = ai[i];
1365: if (aj[j] == i) { /* if this is a diagonal element */
1366: row = i*bs;
1367: aa_j = aa + j*bs2;
1368: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1369: }
1370: }
1371: VecRestoreArray(v,&x);
1372: return(0);
1373: }
1375: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1376: {
1377: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1378: PetscScalar x;
1379: const PetscScalar *l,*li,*ri;
1380: MatScalar *aa,*v;
1381: PetscErrorCode ierr;
1382: PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1383: const PetscInt *ai,*aj;
1384: PetscBool flg;
1387: if (ll != rr) {
1388: VecEqual(ll,rr,&flg);
1389: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1390: }
1391: if (!ll) return(0);
1392: ai = a->i;
1393: aj = a->j;
1394: aa = a->a;
1395: m = A->rmap->N;
1396: bs = A->rmap->bs;
1397: mbs = a->mbs;
1398: bs2 = a->bs2;
1400: VecGetArrayRead(ll,&l);
1401: VecGetLocalSize(ll,&lm);
1402: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1403: for (i=0; i<mbs; i++) { /* for each block row */
1404: M = ai[i+1] - ai[i];
1405: li = l + i*bs;
1406: v = aa + bs2*ai[i];
1407: for (j=0; j<M; j++) { /* for each block */
1408: ri = l + bs*aj[ai[i]+j];
1409: for (k=0; k<bs; k++) {
1410: x = ri[k];
1411: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1412: }
1413: }
1414: }
1415: VecRestoreArrayRead(ll,&l);
1416: PetscLogFlops(2.0*a->nz);
1417: return(0);
1418: }
1420: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1421: {
1422: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1425: info->block_size = a->bs2;
1426: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1427: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1428: info->nz_unneeded = info->nz_allocated - info->nz_used;
1429: info->assemblies = A->num_ass;
1430: info->mallocs = A->info.mallocs;
1431: info->memory = ((PetscObject)A)->mem;
1432: if (A->factortype) {
1433: info->fill_ratio_given = A->info.fill_ratio_given;
1434: info->fill_ratio_needed = A->info.fill_ratio_needed;
1435: info->factor_mallocs = A->info.factor_mallocs;
1436: } else {
1437: info->fill_ratio_given = 0;
1438: info->fill_ratio_needed = 0;
1439: info->factor_mallocs = 0;
1440: }
1441: return(0);
1442: }
1444: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1445: {
1446: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1450: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1451: return(0);
1452: }
1454: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1455: {
1456: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1457: PetscErrorCode ierr;
1458: PetscInt i,j,n,row,col,bs,mbs;
1459: const PetscInt *ai,*aj;
1460: PetscReal atmp;
1461: const MatScalar *aa;
1462: PetscScalar *x;
1463: PetscInt ncols,brow,bcol,krow,kcol;
1466: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1467: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1468: bs = A->rmap->bs;
1469: aa = a->a;
1470: ai = a->i;
1471: aj = a->j;
1472: mbs = a->mbs;
1474: VecSet(v,0.0);
1475: VecGetArray(v,&x);
1476: VecGetLocalSize(v,&n);
1477: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1478: for (i=0; i<mbs; i++) {
1479: ncols = ai[1] - ai[0]; ai++;
1480: brow = bs*i;
1481: for (j=0; j<ncols; j++) {
1482: bcol = bs*(*aj);
1483: for (kcol=0; kcol<bs; kcol++) {
1484: col = bcol + kcol; /* col index */
1485: for (krow=0; krow<bs; krow++) {
1486: atmp = PetscAbsScalar(*aa); aa++;
1487: row = brow + krow; /* row index */
1488: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1489: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1490: }
1491: }
1492: aj++;
1493: }
1494: }
1495: VecRestoreArray(v,&x);
1496: return(0);
1497: }
1499: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1500: {
1504: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1505: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1506: return(0);
1507: }
1509: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1510: {
1511: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1512: PetscScalar *z = c;
1513: const PetscScalar *xb;
1514: PetscScalar x1;
1515: const MatScalar *v = a->a,*vv;
1516: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1517: #if defined(PETSC_USE_COMPLEX)
1518: const int aconj = A->hermitian;
1519: #else
1520: const int aconj = 0;
1521: #endif
1524: for (i=0; i<mbs; i++) {
1525: n = ii[1] - ii[0]; ii++;
1526: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1527: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1528: jj = idx;
1529: vv = v;
1530: for (k=0; k<cn; k++) {
1531: idx = jj;
1532: v = vv;
1533: for (j=0; j<n; j++) {
1534: xb = b + (*idx); x1 = xb[0+k*bm];
1535: z[0+k*cm] += v[0]*x1;
1536: if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1537: v += 1;
1538: ++idx;
1539: }
1540: }
1541: z += 1;
1542: }
1543: return(0);
1544: }
1546: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1547: {
1548: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1549: PetscScalar *z = c;
1550: const PetscScalar *xb;
1551: PetscScalar x1,x2;
1552: const MatScalar *v = a->a,*vv;
1553: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1556: for (i=0; i<mbs; i++) {
1557: n = ii[1] - ii[0]; ii++;
1558: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1559: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1560: jj = idx;
1561: vv = v;
1562: for (k=0; k<cn; k++) {
1563: idx = jj;
1564: v = vv;
1565: for (j=0; j<n; j++) {
1566: xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1567: z[0+k*cm] += v[0]*x1 + v[2]*x2;
1568: z[1+k*cm] += v[1]*x1 + v[3]*x2;
1569: if (*idx != i) {
1570: c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1571: c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1572: }
1573: v += 4;
1574: ++idx;
1575: }
1576: }
1577: z += 2;
1578: }
1579: return(0);
1580: }
1582: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1583: {
1584: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1585: PetscScalar *z = c;
1586: const PetscScalar *xb;
1587: PetscScalar x1,x2,x3;
1588: const MatScalar *v = a->a,*vv;
1589: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1592: for (i=0; i<mbs; i++) {
1593: n = ii[1] - ii[0]; ii++;
1594: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1595: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1596: jj = idx;
1597: vv = v;
1598: for (k=0; k<cn; k++) {
1599: idx = jj;
1600: v = vv;
1601: for (j=0; j<n; j++) {
1602: xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1603: z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1604: z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1605: z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1606: if (*idx != i) {
1607: c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1608: c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1609: c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1610: }
1611: v += 9;
1612: ++idx;
1613: }
1614: }
1615: z += 3;
1616: }
1617: return(0);
1618: }
1620: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1621: {
1622: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1623: PetscScalar *z = c;
1624: const PetscScalar *xb;
1625: PetscScalar x1,x2,x3,x4;
1626: const MatScalar *v = a->a,*vv;
1627: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1630: for (i=0; i<mbs; i++) {
1631: n = ii[1] - ii[0]; ii++;
1632: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1633: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1634: jj = idx;
1635: vv = v;
1636: for (k=0; k<cn; k++) {
1637: idx = jj;
1638: v = vv;
1639: for (j=0; j<n; j++) {
1640: xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1641: z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1642: z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1643: z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1644: z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1645: if (*idx != i) {
1646: c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm];
1647: c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm];
1648: c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1649: c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1650: }
1651: v += 16;
1652: ++idx;
1653: }
1654: }
1655: z += 4;
1656: }
1657: return(0);
1658: }
1660: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1661: {
1662: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1663: PetscScalar *z = c;
1664: const PetscScalar *xb;
1665: PetscScalar x1,x2,x3,x4,x5;
1666: const MatScalar *v = a->a,*vv;
1667: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1670: for (i=0; i<mbs; i++) {
1671: n = ii[1] - ii[0]; ii++;
1672: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1673: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1674: jj = idx;
1675: vv = v;
1676: for (k=0; k<cn; k++) {
1677: idx = jj;
1678: v = vv;
1679: for (j=0; j<n; j++) {
1680: xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1681: z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1682: z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1683: z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1684: z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1685: z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1686: if (*idx != i) {
1687: c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1688: c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1689: c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1690: c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1691: c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1692: }
1693: v += 25;
1694: ++idx;
1695: }
1696: }
1697: z += 5;
1698: }
1699: return(0);
1700: }
1702: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1703: {
1704: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1705: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1706: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
1707: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1708: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1709: PetscBLASInt bbs,bcn,bbm,bcm;
1710: PetscScalar *z = NULL;
1711: PetscScalar *c,*b;
1712: const MatScalar *v;
1713: const PetscInt *idx,*ii;
1714: PetscScalar _DOne=1.0;
1715: PetscErrorCode ierr;
1718: if (!cm || !cn) return(0);
1719: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1720: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1721: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1722: b = bd->v;
1723: MatZeroEntries(C);
1724: MatDenseGetArray(C,&c);
1725: switch (bs) {
1726: case 1:
1727: MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1728: break;
1729: case 2:
1730: MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1731: break;
1732: case 3:
1733: MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1734: break;
1735: case 4:
1736: MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1737: break;
1738: case 5:
1739: MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1740: break;
1741: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1742: PetscBLASIntCast(bs,&bbs);
1743: PetscBLASIntCast(cn,&bcn);
1744: PetscBLASIntCast(bm,&bbm);
1745: PetscBLASIntCast(cm,&bcm);
1746: idx = a->j;
1747: v = a->a;
1748: mbs = a->mbs;
1749: ii = a->i;
1750: z = c;
1751: for (i=0; i<mbs; i++) {
1752: n = ii[1] - ii[0]; ii++;
1753: for (j=0; j<n; j++) {
1754: if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1755: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1756: v += bs2;
1757: }
1758: z += bs;
1759: }
1760: }
1761: MatDenseRestoreArray(C,&c);
1762: PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);
1763: return(0);
1764: }