Actual source code: sbaij2.c
petsc-3.13.6 2020-09-29
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
13: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14: const PetscInt *idx;
15: PetscBT table_out,table_in;
18: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19: mbs = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
23: PetscBTCreate(mbs,&table_out);
24: PetscMalloc1(mbs+1,&nidx);
25: PetscMalloc1(A->rmap->N+1,&nidx2);
26: PetscBTCreate(mbs,&table_in);
28: for (i=0; i<is_max; i++) { /* for each is */
29: isz = 0;
30: PetscBTMemzero(mbs,table_out);
32: /* Extract the indices, assume there can be duplicate entries */
33: ISGetIndices(is[i],&idx);
34: ISGetLocalSize(is[i],&n);
36: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37: bcol_max = 0;
38: for (j=0; j<n; ++j) {
39: brow = idx[j]/bs; /* convert the indices into block indices */
40: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41: if (!PetscBTLookupSet(table_out,brow)) {
42: nidx[isz++] = brow;
43: if (bcol_max < brow) bcol_max = brow;
44: }
45: }
46: ISRestoreIndices(is[i],&idx);
47: ISDestroy(&is[i]);
49: k = 0;
50: for (j=0; j<ov; j++) { /* for each overlap */
51: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52: PetscBTMemzero(mbs,table_in);
53: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
55: n = isz; /* length of the updated is[i] */
56: for (brow=0; brow<mbs; brow++) {
57: start = ai[brow]; end = ai[brow+1];
58: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59: for (l = start; l<end; l++) {
60: bcol = aj[l];
61: if (!PetscBTLookupSet(table_out,bcol)) {
62: nidx[isz++] = bcol;
63: if (bcol_max < bcol) bcol_max = bcol;
64: }
65: }
66: k++;
67: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69: for (l = start; l<end; l++) {
70: bcol = aj[l];
71: if (bcol > bcol_max) break;
72: if (PetscBTLookup(table_in,bcol)) {
73: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74: break; /* for l = start; l<end ; l++) */
75: }
76: }
77: }
78: }
79: } /* for each overlap */
81: /* expand the Index Set */
82: for (j=0; j<isz; j++) {
83: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84: }
85: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
86: }
87: PetscBTDestroy(&table_out);
88: PetscFree(nidx);
89: PetscFree(nidx2);
90: PetscBTDestroy(&table_in);
91: return(0);
92: }
94: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95: Zero some ops' to avoid invalid usse */
96: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97: {
101: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
102: Bseq->ops->mult = 0;
103: Bseq->ops->multadd = 0;
104: Bseq->ops->multtranspose = 0;
105: Bseq->ops->multtransposeadd = 0;
106: Bseq->ops->lufactor = 0;
107: Bseq->ops->choleskyfactor = 0;
108: Bseq->ops->lufactorsymbolic = 0;
109: Bseq->ops->choleskyfactorsymbolic = 0;
110: Bseq->ops->getinertia = 0;
111: return(0);
112: }
114: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
115: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116: {
117: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
119: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
121: const PetscInt *irow,*icol;
122: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123: PetscInt *aj = a->j,*ai = a->i;
124: MatScalar *mat_a;
125: Mat C;
126: PetscBool flag;
130: ISGetIndices(isrow,&irow);
131: ISGetIndices(iscol,&icol);
132: ISGetLocalSize(isrow,&nrows);
133: ISGetLocalSize(iscol,&ncols);
135: PetscCalloc1(1+oldcols,&smap);
136: ssmap = smap;
137: PetscMalloc1(1+nrows,&lens);
138: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139: /* determine lens of each row */
140: for (i=0; i<nrows; i++) {
141: kstart = ai[irow[i]];
142: kend = kstart + a->ilen[irow[i]];
143: lens[i] = 0;
144: for (k=kstart; k<kend; k++) {
145: if (ssmap[aj[k]]) lens[i]++;
146: }
147: }
148: /* Create and fill new matrix */
149: if (scall == MAT_REUSE_MATRIX) {
150: c = (Mat_SeqSBAIJ*)((*B)->data);
152: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
154: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155: PetscArrayzero(c->ilen,c->mbs);
156: C = *B;
157: } else {
158: MatCreate(PetscObjectComm((PetscObject)A),&C);
159: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
160: MatSetType(C,((PetscObject)A)->type_name);
161: MatSeqSBAIJSetPreallocation(C,bs,0,lens);
162: }
163: c = (Mat_SeqSBAIJ*)(C->data);
164: for (i=0; i<nrows; i++) {
165: row = irow[i];
166: kstart = ai[row];
167: kend = kstart + a->ilen[row];
168: mat_i = c->i[i];
169: mat_j = c->j + mat_i;
170: mat_a = c->a + mat_i*bs2;
171: mat_ilen = c->ilen + i;
172: for (k=kstart; k<kend; k++) {
173: if ((tcol=ssmap[a->j[k]])) {
174: *mat_j++ = tcol - 1;
175: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
176: mat_a += bs2;
177: (*mat_ilen)++;
178: }
179: }
180: }
181: /* sort */
182: {
183: MatScalar *work;
185: PetscMalloc1(bs2,&work);
186: for (i=0; i<nrows; i++) {
187: PetscInt ilen;
188: mat_i = c->i[i];
189: mat_j = c->j + mat_i;
190: mat_a = c->a + mat_i*bs2;
191: ilen = c->ilen[i];
192: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
193: }
194: PetscFree(work);
195: }
197: /* Free work space */
198: ISRestoreIndices(iscol,&icol);
199: PetscFree(smap);
200: PetscFree(lens);
201: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
202: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
204: ISRestoreIndices(isrow,&irow);
205: *B = C;
206: return(0);
207: }
209: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210: {
211: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
212: IS is1,is2;
214: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215: const PetscInt *irow,*icol;
218: ISGetIndices(isrow,&irow);
219: ISGetIndices(iscol,&icol);
220: ISGetLocalSize(isrow,&nrows);
221: ISGetLocalSize(iscol,&ncols);
223: /* Verify if the indices corespond to each element in a block
224: and form the IS with compressed IS */
225: maxmnbs = PetscMax(a->mbs,a->nbs);
226: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
227: PetscArrayzero(vary,a->mbs);
228: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229: for (i=0; i<a->mbs; i++) {
230: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231: }
232: count = 0;
233: for (i=0; i<nrows; i++) {
234: PetscInt j = irow[i] / bs;
235: if ((vary[j]--)==bs) iary[count++] = j;
236: }
237: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
239: PetscArrayzero(vary,a->nbs);
240: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241: for (i=0; i<a->nbs; i++) {
242: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243: }
244: count = 0;
245: for (i=0; i<ncols; i++) {
246: PetscInt j = icol[i] / bs;
247: if ((vary[j]--)==bs) iary[count++] = j;
248: }
249: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
250: ISRestoreIndices(isrow,&irow);
251: ISRestoreIndices(iscol,&icol);
252: PetscFree2(vary,iary);
254: MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
255: ISDestroy(&is1);
256: ISDestroy(&is2);
258: if (isrow != iscol) {
259: PetscBool isequal;
260: ISEqual(isrow,iscol,&isequal);
261: if (!isequal) {
262: MatSeqSBAIJZeroOps_Private(*B);
263: }
264: }
265: return(0);
266: }
268: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269: {
271: PetscInt i;
274: if (scall == MAT_INITIAL_MATRIX) {
275: PetscCalloc1(n+1,B);
276: }
278: for (i=0; i<n; i++) {
279: MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
280: }
281: return(0);
282: }
284: /* -------------------------------------------------------*/
285: /* Should check that shapes of vectors and matrices match */
286: /* -------------------------------------------------------*/
288: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289: {
290: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
291: PetscScalar *z,x1,x2,zero=0.0;
292: const PetscScalar *x,*xb;
293: const MatScalar *v;
294: PetscErrorCode ierr;
295: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
296: const PetscInt *aj=a->j,*ai=a->i,*ib;
297: PetscInt nonzerorow=0;
300: VecSet(zz,zero);
301: if (!a->nz) return(0);
302: VecGetArrayRead(xx,&x);
303: VecGetArray(zz,&z);
305: v = a->a;
306: xb = x;
308: for (i=0; i<mbs; i++) {
309: n = ai[1] - ai[0]; /* length of i_th block row of A */
310: x1 = xb[0]; x2 = xb[1];
311: ib = aj + *ai;
312: jmin = 0;
313: nonzerorow += (n>0);
314: if (*ib == i) { /* (diag of A)*x */
315: z[2*i] += v[0]*x1 + v[2]*x2;
316: z[2*i+1] += v[2]*x1 + v[3]*x2;
317: v += 4; jmin++;
318: }
319: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321: for (j=jmin; j<n; j++) {
322: /* (strict lower triangular part of A)*x */
323: cval = ib[j]*2;
324: z[cval] += v[0]*x1 + v[1]*x2;
325: z[cval+1] += v[2]*x1 + v[3]*x2;
326: /* (strict upper triangular part of A)*x */
327: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
328: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329: v += 4;
330: }
331: xb +=2; ai++;
332: }
334: VecRestoreArrayRead(xx,&x);
335: VecRestoreArray(zz,&z);
336: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
337: return(0);
338: }
340: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341: {
342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
343: PetscScalar *z,x1,x2,x3,zero=0.0;
344: const PetscScalar *x,*xb;
345: const MatScalar *v;
346: PetscErrorCode ierr;
347: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
348: const PetscInt *aj = a->j,*ai = a->i,*ib;
349: PetscInt nonzerorow=0;
352: VecSet(zz,zero);
353: if (!a->nz) return(0);
354: VecGetArrayRead(xx,&x);
355: VecGetArray(zz,&z);
357: v = a->a;
358: xb = x;
360: for (i=0; i<mbs; i++) {
361: n = ai[1] - ai[0]; /* length of i_th block row of A */
362: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
363: ib = aj + *ai;
364: jmin = 0;
365: nonzerorow += (n>0);
366: if (*ib == i) { /* (diag of A)*x */
367: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
368: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370: v += 9; jmin++;
371: }
372: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
374: for (j=jmin; j<n; j++) {
375: /* (strict lower triangular part of A)*x */
376: cval = ib[j]*3;
377: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
378: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380: /* (strict upper triangular part of A)*x */
381: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384: v += 9;
385: }
386: xb +=3; ai++;
387: }
389: VecRestoreArrayRead(xx,&x);
390: VecRestoreArray(zz,&z);
391: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
392: return(0);
393: }
395: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396: {
397: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
398: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
399: const PetscScalar *x,*xb;
400: const MatScalar *v;
401: PetscErrorCode ierr;
402: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
403: const PetscInt *aj = a->j,*ai = a->i,*ib;
404: PetscInt nonzerorow = 0;
407: VecSet(zz,zero);
408: if (!a->nz) return(0);
409: VecGetArrayRead(xx,&x);
410: VecGetArray(zz,&z);
412: v = a->a;
413: xb = x;
415: for (i=0; i<mbs; i++) {
416: n = ai[1] - ai[0]; /* length of i_th block row of A */
417: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418: ib = aj + *ai;
419: jmin = 0;
420: nonzerorow += (n>0);
421: if (*ib == i) { /* (diag of A)*x */
422: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
423: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
424: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426: v += 16; jmin++;
427: }
428: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430: for (j=jmin; j<n; j++) {
431: /* (strict lower triangular part of A)*x */
432: cval = ib[j]*4;
433: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437: /* (strict upper triangular part of A)*x */
438: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442: v += 16;
443: }
444: xb +=4; ai++;
445: }
447: VecRestoreArrayRead(xx,&x);
448: VecRestoreArray(zz,&z);
449: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
450: return(0);
451: }
453: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454: {
455: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
456: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
457: const PetscScalar *x,*xb;
458: const MatScalar *v;
459: PetscErrorCode ierr;
460: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
461: const PetscInt *aj = a->j,*ai = a->i,*ib;
462: PetscInt nonzerorow=0;
465: VecSet(zz,zero);
466: if (!a->nz) return(0);
467: VecGetArrayRead(xx,&x);
468: VecGetArray(zz,&z);
470: v = a->a;
471: xb = x;
473: for (i=0; i<mbs; i++) {
474: n = ai[1] - ai[0]; /* length of i_th block row of A */
475: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476: ib = aj + *ai;
477: jmin = 0;
478: nonzerorow += (n>0);
479: if (*ib == i) { /* (diag of A)*x */
480: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485: v += 25; jmin++;
486: }
487: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489: for (j=jmin; j<n; j++) {
490: /* (strict lower triangular part of A)*x */
491: cval = ib[j]*5;
492: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497: /* (strict upper triangular part of A)*x */
498: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503: v += 25;
504: }
505: xb +=5; ai++;
506: }
508: VecRestoreArrayRead(xx,&x);
509: VecRestoreArray(zz,&z);
510: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
511: return(0);
512: }
514: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
517: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518: const PetscScalar *x,*xb;
519: const MatScalar *v;
520: PetscErrorCode ierr;
521: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
522: const PetscInt *aj=a->j,*ai=a->i,*ib;
523: PetscInt nonzerorow=0;
526: VecSet(zz,zero);
527: if (!a->nz) return(0);
528: VecGetArrayRead(xx,&x);
529: VecGetArray(zz,&z);
531: v = a->a;
532: xb = x;
534: for (i=0; i<mbs; i++) {
535: n = ai[1] - ai[0]; /* length of i_th block row of A */
536: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537: ib = aj + *ai;
538: jmin = 0;
539: nonzerorow += (n>0);
540: if (*ib == i) { /* (diag of A)*x */
541: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547: v += 36; jmin++;
548: }
549: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551: for (j=jmin; j<n; j++) {
552: /* (strict lower triangular part of A)*x */
553: cval = ib[j]*6;
554: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560: /* (strict upper triangular part of A)*x */
561: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567: v += 36;
568: }
569: xb +=6; ai++;
570: }
572: VecRestoreArrayRead(xx,&x);
573: VecRestoreArray(zz,&z);
574: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
575: return(0);
576: }
578: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579: {
580: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
581: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582: const PetscScalar *x,*xb;
583: const MatScalar *v;
584: PetscErrorCode ierr;
585: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
586: const PetscInt *aj=a->j,*ai=a->i,*ib;
587: PetscInt nonzerorow=0;
590: VecSet(zz,zero);
591: if (!a->nz) return(0);
592: VecGetArrayRead(xx,&x);
593: VecGetArray(zz,&z);
595: v = a->a;
596: xb = x;
598: for (i=0; i<mbs; i++) {
599: n = ai[1] - ai[0]; /* length of i_th block row of A */
600: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601: ib = aj + *ai;
602: jmin = 0;
603: nonzerorow += (n>0);
604: if (*ib == i) { /* (diag of A)*x */
605: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612: v += 49; jmin++;
613: }
614: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616: for (j=jmin; j<n; j++) {
617: /* (strict lower triangular part of A)*x */
618: cval = ib[j]*7;
619: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626: /* (strict upper triangular part of A)*x */
627: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634: v += 49;
635: }
636: xb +=7; ai++;
637: }
638: VecRestoreArrayRead(xx,&x);
639: VecRestoreArray(zz,&z);
640: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
641: return(0);
642: }
644: /*
645: This will not work with MatScalar == float because it calls the BLAS
646: */
647: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648: {
649: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
650: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651: const PetscScalar *x,*x_ptr,*xb;
652: const MatScalar *v;
653: PetscErrorCode ierr;
654: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655: const PetscInt *idx,*aj,*ii;
656: PetscInt nonzerorow=0;
659: VecSet(zz,zero);
660: if (!a->nz) return(0);
661: VecGetArrayRead(xx,&x);x_ptr = x;
662: VecGetArray(zz,&z); z_ptr=z;
664: aj = a->j;
665: v = a->a;
666: ii = a->i;
668: if (!a->mult_work) {
669: PetscMalloc1(A->rmap->N+1,&a->mult_work);
670: }
671: work = a->mult_work;
673: for (i=0; i<mbs; i++) {
674: n = ii[1] - ii[0]; ncols = n*bs;
675: workt = work; idx=aj+ii[0];
676: nonzerorow += (n>0);
678: /* upper triangular part */
679: for (j=0; j<n; j++) {
680: xb = x_ptr + bs*(*idx++);
681: for (k=0; k<bs; k++) workt[k] = xb[k];
682: workt += bs;
683: }
684: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
685: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
687: /* strict lower triangular part */
688: idx = aj+ii[0];
689: if (*idx == i) {
690: ncols -= bs; v += bs2; idx++; n--;
691: }
693: if (ncols > 0) {
694: workt = work;
695: PetscArrayzero(workt,ncols);
696: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
697: for (j=0; j<n; j++) {
698: zb = z_ptr + bs*(*idx++);
699: for (k=0; k<bs; k++) zb[k] += workt[k];
700: workt += bs;
701: }
702: }
703: x += bs; v += n*bs2; z += bs; ii++;
704: }
706: VecRestoreArrayRead(xx,&x);
707: VecRestoreArray(zz,&z);
708: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
709: return(0);
710: }
712: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
713: {
714: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
715: PetscScalar *z,x1;
716: const PetscScalar *x,*xb;
717: const MatScalar *v;
718: PetscErrorCode ierr;
719: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
720: const PetscInt *aj=a->j,*ai=a->i,*ib;
721: PetscInt nonzerorow=0;
722: #if defined(PETSC_USE_COMPLEX)
723: const int aconj = A->hermitian;
724: #else
725: const int aconj = 0;
726: #endif
729: VecCopy(yy,zz);
730: if (!a->nz) return(0);
731: VecGetArrayRead(xx,&x);
732: VecGetArray(zz,&z);
733: v = a->a;
734: xb = x;
736: for (i=0; i<mbs; i++) {
737: n = ai[1] - ai[0]; /* length of i_th row of A */
738: x1 = xb[0];
739: ib = aj + *ai;
740: jmin = 0;
741: nonzerorow += (n>0);
742: if (n && *ib == i) { /* (diag of A)*x */
743: z[i] += *v++ * x[*ib++]; jmin++;
744: }
745: if (aconj) {
746: for (j=jmin; j<n; j++) {
747: cval = *ib;
748: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
749: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
750: }
751: } else {
752: for (j=jmin; j<n; j++) {
753: cval = *ib;
754: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
755: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
756: }
757: }
758: xb++; ai++;
759: }
761: VecRestoreArrayRead(xx,&x);
762: VecRestoreArray(zz,&z);
764: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
765: return(0);
766: }
768: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
769: {
770: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
771: PetscScalar *z,x1,x2;
772: const PetscScalar *x,*xb;
773: const MatScalar *v;
774: PetscErrorCode ierr;
775: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
776: const PetscInt *aj=a->j,*ai=a->i,*ib;
777: PetscInt nonzerorow=0;
780: VecCopy(yy,zz);
781: if (!a->nz) return(0);
782: VecGetArrayRead(xx,&x);
783: VecGetArray(zz,&z);
785: v = a->a;
786: xb = x;
788: for (i=0; i<mbs; i++) {
789: n = ai[1] - ai[0]; /* length of i_th block row of A */
790: x1 = xb[0]; x2 = xb[1];
791: ib = aj + *ai;
792: jmin = 0;
793: nonzerorow += (n>0);
794: if (*ib == i) { /* (diag of A)*x */
795: z[2*i] += v[0]*x1 + v[2]*x2;
796: z[2*i+1] += v[2]*x1 + v[3]*x2;
797: v += 4; jmin++;
798: }
799: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
801: for (j=jmin; j<n; j++) {
802: /* (strict lower triangular part of A)*x */
803: cval = ib[j]*2;
804: z[cval] += v[0]*x1 + v[1]*x2;
805: z[cval+1] += v[2]*x1 + v[3]*x2;
806: /* (strict upper triangular part of A)*x */
807: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
808: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
809: v += 4;
810: }
811: xb +=2; ai++;
812: }
813: VecRestoreArrayRead(xx,&x);
814: VecRestoreArray(zz,&z);
816: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
817: return(0);
818: }
820: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
821: {
822: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
823: PetscScalar *z,x1,x2,x3;
824: const PetscScalar *x,*xb;
825: const MatScalar *v;
826: PetscErrorCode ierr;
827: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
828: const PetscInt *aj=a->j,*ai=a->i,*ib;
829: PetscInt nonzerorow=0;
832: VecCopy(yy,zz);
833: if (!a->nz) return(0);
834: VecGetArrayRead(xx,&x);
835: VecGetArray(zz,&z);
837: v = a->a;
838: xb = x;
840: for (i=0; i<mbs; i++) {
841: n = ai[1] - ai[0]; /* length of i_th block row of A */
842: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
843: ib = aj + *ai;
844: jmin = 0;
845: nonzerorow += (n>0);
846: if (*ib == i) { /* (diag of A)*x */
847: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
848: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
849: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
850: v += 9; jmin++;
851: }
852: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
853: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
854: for (j=jmin; j<n; j++) {
855: /* (strict lower triangular part of A)*x */
856: cval = ib[j]*3;
857: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
858: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
859: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
860: /* (strict upper triangular part of A)*x */
861: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
862: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
863: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
864: v += 9;
865: }
866: xb +=3; ai++;
867: }
869: VecRestoreArrayRead(xx,&x);
870: VecRestoreArray(zz,&z);
872: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
873: return(0);
874: }
876: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
877: {
878: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
879: PetscScalar *z,x1,x2,x3,x4;
880: const PetscScalar *x,*xb;
881: const MatScalar *v;
882: PetscErrorCode ierr;
883: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
884: const PetscInt *aj=a->j,*ai=a->i,*ib;
885: PetscInt nonzerorow=0;
888: VecCopy(yy,zz);
889: if (!a->nz) return(0);
890: VecGetArrayRead(xx,&x);
891: VecGetArray(zz,&z);
893: v = a->a;
894: xb = x;
896: for (i=0; i<mbs; i++) {
897: n = ai[1] - ai[0]; /* length of i_th block row of A */
898: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
899: ib = aj + *ai;
900: jmin = 0;
901: nonzerorow += (n>0);
902: if (*ib == i) { /* (diag of A)*x */
903: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
904: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
905: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
906: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
907: v += 16; jmin++;
908: }
909: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
910: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
911: for (j=jmin; j<n; j++) {
912: /* (strict lower triangular part of A)*x */
913: cval = ib[j]*4;
914: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
915: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
916: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
917: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
918: /* (strict upper triangular part of A)*x */
919: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
920: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
921: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
922: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
923: v += 16;
924: }
925: xb +=4; ai++;
926: }
928: VecRestoreArrayRead(xx,&x);
929: VecRestoreArray(zz,&z);
931: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
932: return(0);
933: }
935: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
936: {
937: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
938: PetscScalar *z,x1,x2,x3,x4,x5;
939: const PetscScalar *x,*xb;
940: const MatScalar *v;
941: PetscErrorCode ierr;
942: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
943: const PetscInt *aj=a->j,*ai=a->i,*ib;
944: PetscInt nonzerorow=0;
947: VecCopy(yy,zz);
948: if (!a->nz) return(0);
949: VecGetArrayRead(xx,&x);
950: VecGetArray(zz,&z);
952: v = a->a;
953: xb = x;
955: for (i=0; i<mbs; i++) {
956: n = ai[1] - ai[0]; /* length of i_th block row of A */
957: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
958: ib = aj + *ai;
959: jmin = 0;
960: nonzerorow += (n>0);
961: if (*ib == i) { /* (diag of A)*x */
962: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
963: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
964: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
965: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
966: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
967: v += 25; jmin++;
968: }
969: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
970: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971: for (j=jmin; j<n; j++) {
972: /* (strict lower triangular part of A)*x */
973: cval = ib[j]*5;
974: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
975: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
976: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
977: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
978: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
979: /* (strict upper triangular part of A)*x */
980: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
981: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
982: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
983: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
984: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
985: v += 25;
986: }
987: xb +=5; ai++;
988: }
990: VecRestoreArrayRead(xx,&x);
991: VecRestoreArray(zz,&z);
993: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
994: return(0);
995: }
997: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
998: {
999: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1000: PetscScalar *z,x1,x2,x3,x4,x5,x6;
1001: const PetscScalar *x,*xb;
1002: const MatScalar *v;
1003: PetscErrorCode ierr;
1004: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1005: const PetscInt *aj=a->j,*ai=a->i,*ib;
1006: PetscInt nonzerorow=0;
1009: VecCopy(yy,zz);
1010: if (!a->nz) return(0);
1011: VecGetArrayRead(xx,&x);
1012: VecGetArray(zz,&z);
1014: v = a->a;
1015: xb = x;
1017: for (i=0; i<mbs; i++) {
1018: n = ai[1] - ai[0]; /* length of i_th block row of A */
1019: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1020: ib = aj + *ai;
1021: jmin = 0;
1022: nonzerorow += (n>0);
1023: if (*ib == i) { /* (diag of A)*x */
1024: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1025: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1026: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1027: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1028: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1029: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1030: v += 36; jmin++;
1031: }
1032: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1033: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034: for (j=jmin; j<n; j++) {
1035: /* (strict lower triangular part of A)*x */
1036: cval = ib[j]*6;
1037: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1038: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1039: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1040: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1041: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1042: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1043: /* (strict upper triangular part of A)*x */
1044: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1045: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1046: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1047: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1048: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1049: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1050: v += 36;
1051: }
1052: xb +=6; ai++;
1053: }
1055: VecRestoreArrayRead(xx,&x);
1056: VecRestoreArray(zz,&z);
1058: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1059: return(0);
1060: }
1062: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1063: {
1064: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1065: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1066: const PetscScalar *x,*xb;
1067: const MatScalar *v;
1068: PetscErrorCode ierr;
1069: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1070: const PetscInt *aj=a->j,*ai=a->i,*ib;
1071: PetscInt nonzerorow=0;
1074: VecCopy(yy,zz);
1075: if (!a->nz) return(0);
1076: VecGetArrayRead(xx,&x);
1077: VecGetArray(zz,&z);
1079: v = a->a;
1080: xb = x;
1082: for (i=0; i<mbs; i++) {
1083: n = ai[1] - ai[0]; /* length of i_th block row of A */
1084: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1085: ib = aj + *ai;
1086: jmin = 0;
1087: nonzerorow += (n>0);
1088: if (*ib == i) { /* (diag of A)*x */
1089: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1090: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1091: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1092: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1093: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1094: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1095: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1096: v += 49; jmin++;
1097: }
1098: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1099: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1100: for (j=jmin; j<n; j++) {
1101: /* (strict lower triangular part of A)*x */
1102: cval = ib[j]*7;
1103: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1104: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1105: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1106: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1107: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1108: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1109: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1110: /* (strict upper triangular part of A)*x */
1111: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1112: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1113: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1114: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1115: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1116: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1117: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1118: v += 49;
1119: }
1120: xb +=7; ai++;
1121: }
1123: VecRestoreArrayRead(xx,&x);
1124: VecRestoreArray(zz,&z);
1126: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1127: return(0);
1128: }
1130: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1131: {
1132: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1133: PetscScalar *z,*z_ptr=0,*zb,*work,*workt;
1134: const PetscScalar *x,*x_ptr,*xb;
1135: const MatScalar *v;
1136: PetscErrorCode ierr;
1137: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1138: const PetscInt *idx,*aj,*ii;
1139: PetscInt nonzerorow=0;
1142: VecCopy(yy,zz);
1143: if (!a->nz) return(0);
1144: VecGetArrayRead(xx,&x); x_ptr=x;
1145: VecGetArray(zz,&z); z_ptr=z;
1147: aj = a->j;
1148: v = a->a;
1149: ii = a->i;
1151: if (!a->mult_work) {
1152: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1153: }
1154: work = a->mult_work;
1157: for (i=0; i<mbs; i++) {
1158: n = ii[1] - ii[0]; ncols = n*bs;
1159: workt = work; idx=aj+ii[0];
1160: nonzerorow += (n>0);
1162: /* upper triangular part */
1163: for (j=0; j<n; j++) {
1164: xb = x_ptr + bs*(*idx++);
1165: for (k=0; k<bs; k++) workt[k] = xb[k];
1166: workt += bs;
1167: }
1168: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1169: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1171: /* strict lower triangular part */
1172: idx = aj+ii[0];
1173: if (*idx == i) {
1174: ncols -= bs; v += bs2; idx++; n--;
1175: }
1176: if (ncols > 0) {
1177: workt = work;
1178: PetscArrayzero(workt,ncols);
1179: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1180: for (j=0; j<n; j++) {
1181: zb = z_ptr + bs*(*idx++);
1182: for (k=0; k<bs; k++) zb[k] += workt[k];
1183: workt += bs;
1184: }
1185: }
1187: x += bs; v += n*bs2; z += bs; ii++;
1188: }
1190: VecRestoreArrayRead(xx,&x);
1191: VecRestoreArray(zz,&z);
1193: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1194: return(0);
1195: }
1197: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1198: {
1199: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1200: PetscScalar oalpha = alpha;
1202: PetscBLASInt one = 1,totalnz;
1205: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1206: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1207: PetscLogFlops(totalnz);
1208: return(0);
1209: }
1211: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1212: {
1213: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1214: const MatScalar *v = a->a;
1215: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1216: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1217: PetscErrorCode ierr;
1218: const PetscInt *aj=a->j,*col;
1221: if (!a->nz) {
1222: *norm = 0.0;
1223: return(0);
1224: }
1225: if (type == NORM_FROBENIUS) {
1226: for (k=0; k<mbs; k++) {
1227: jmin = a->i[k]; jmax = a->i[k+1];
1228: col = aj + jmin;
1229: if (*col == k) { /* diagonal block */
1230: for (i=0; i<bs2; i++) {
1231: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1232: }
1233: jmin++;
1234: }
1235: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1236: for (i=0; i<bs2; i++) {
1237: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1238: }
1239: }
1240: }
1241: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1242: PetscLogFlops(2.0*bs2*a->nz);
1243: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1244: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1245: for (i=0; i<mbs; i++) jl[i] = mbs;
1246: il[0] = 0;
1248: *norm = 0.0;
1249: for (k=0; k<mbs; k++) { /* k_th block row */
1250: for (j=0; j<bs; j++) sum[j]=0.0;
1251: /*-- col sum --*/
1252: i = jl[k]; /* first |A(i,k)| to be added */
1253: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1254: at step k */
1255: while (i<mbs) {
1256: nexti = jl[i]; /* next block row to be added */
1257: ik = il[i]; /* block index of A(i,k) in the array a */
1258: for (j=0; j<bs; j++) {
1259: v = a->a + ik*bs2 + j*bs;
1260: for (k1=0; k1<bs; k1++) {
1261: sum[j] += PetscAbsScalar(*v); v++;
1262: }
1263: }
1264: /* update il, jl */
1265: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1266: jmax = a->i[i+1];
1267: if (jmin < jmax) {
1268: il[i] = jmin;
1269: j = a->j[jmin];
1270: jl[i] = jl[j]; jl[j]=i;
1271: }
1272: i = nexti;
1273: }
1274: /*-- row sum --*/
1275: jmin = a->i[k]; jmax = a->i[k+1];
1276: for (i=jmin; i<jmax; i++) {
1277: for (j=0; j<bs; j++) {
1278: v = a->a + i*bs2 + j;
1279: for (k1=0; k1<bs; k1++) {
1280: sum[j] += PetscAbsScalar(*v); v += bs;
1281: }
1282: }
1283: }
1284: /* add k_th block row to il, jl */
1285: col = aj+jmin;
1286: if (*col == k) jmin++;
1287: if (jmin < jmax) {
1288: il[k] = jmin;
1289: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1290: }
1291: for (j=0; j<bs; j++) {
1292: if (sum[j] > *norm) *norm = sum[j];
1293: }
1294: }
1295: PetscFree3(sum,il,jl);
1296: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1297: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1298: return(0);
1299: }
1301: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1302: {
1303: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1307: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1308: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1309: *flg = PETSC_FALSE;
1310: return(0);
1311: }
1313: /* if the a->i are the same */
1314: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1315: if (!*flg) return(0);
1317: /* if a->j are the same */
1318: PetscArraycmp(a->j,b->j,a->nz,flg);
1319: if (!*flg) return(0);
1321: /* if a->a are the same */
1322: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1323: return(0);
1324: }
1326: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1327: {
1328: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1329: PetscErrorCode ierr;
1330: PetscInt i,j,k,row,bs,ambs,bs2;
1331: const PetscInt *ai,*aj;
1332: PetscScalar *x,zero = 0.0;
1333: const MatScalar *aa,*aa_j;
1336: bs = A->rmap->bs;
1337: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1339: aa = a->a;
1340: ambs = a->mbs;
1342: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1343: PetscInt *diag=a->diag;
1344: aa = a->a;
1345: ambs = a->mbs;
1346: VecGetArray(v,&x);
1347: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1348: VecRestoreArray(v,&x);
1349: return(0);
1350: }
1352: ai = a->i;
1353: aj = a->j;
1354: bs2 = a->bs2;
1355: VecSet(v,zero);
1356: if (!a->nz) return(0);
1357: VecGetArray(v,&x);
1358: for (i=0; i<ambs; i++) {
1359: j=ai[i];
1360: if (aj[j] == i) { /* if this is a diagonal element */
1361: row = i*bs;
1362: aa_j = aa + j*bs2;
1363: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1364: }
1365: }
1366: VecRestoreArray(v,&x);
1367: return(0);
1368: }
1370: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1371: {
1372: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1373: PetscScalar x;
1374: const PetscScalar *l,*li,*ri;
1375: MatScalar *aa,*v;
1376: PetscErrorCode ierr;
1377: PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1378: const PetscInt *ai,*aj;
1379: PetscBool flg;
1382: if (ll != rr) {
1383: VecEqual(ll,rr,&flg);
1384: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1385: }
1386: if (!ll) return(0);
1387: ai = a->i;
1388: aj = a->j;
1389: aa = a->a;
1390: m = A->rmap->N;
1391: bs = A->rmap->bs;
1392: mbs = a->mbs;
1393: bs2 = a->bs2;
1395: VecGetArrayRead(ll,&l);
1396: VecGetLocalSize(ll,&lm);
1397: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1398: for (i=0; i<mbs; i++) { /* for each block row */
1399: M = ai[i+1] - ai[i];
1400: li = l + i*bs;
1401: v = aa + bs2*ai[i];
1402: for (j=0; j<M; j++) { /* for each block */
1403: ri = l + bs*aj[ai[i]+j];
1404: for (k=0; k<bs; k++) {
1405: x = ri[k];
1406: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1407: }
1408: }
1409: }
1410: VecRestoreArrayRead(ll,&l);
1411: PetscLogFlops(2.0*a->nz);
1412: return(0);
1413: }
1415: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1416: {
1417: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1420: info->block_size = a->bs2;
1421: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1422: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1423: info->nz_unneeded = info->nz_allocated - info->nz_used;
1424: info->assemblies = A->num_ass;
1425: info->mallocs = A->info.mallocs;
1426: info->memory = ((PetscObject)A)->mem;
1427: if (A->factortype) {
1428: info->fill_ratio_given = A->info.fill_ratio_given;
1429: info->fill_ratio_needed = A->info.fill_ratio_needed;
1430: info->factor_mallocs = A->info.factor_mallocs;
1431: } else {
1432: info->fill_ratio_given = 0;
1433: info->fill_ratio_needed = 0;
1434: info->factor_mallocs = 0;
1435: }
1436: return(0);
1437: }
1439: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1440: {
1441: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1445: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1446: return(0);
1447: }
1449: /*
1450: This code does not work since it only checks the upper triangular part of
1451: the matrix. Hence it is not listed in the function table.
1452: */
1453: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1454: {
1455: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1456: PetscErrorCode ierr;
1457: PetscInt i,j,n,row,col,bs,mbs;
1458: const PetscInt *ai,*aj;
1459: PetscReal atmp;
1460: const MatScalar *aa;
1461: PetscScalar *x;
1462: PetscInt ncols,brow,bcol,krow,kcol;
1465: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1466: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1467: bs = A->rmap->bs;
1468: aa = a->a;
1469: ai = a->i;
1470: aj = a->j;
1471: mbs = a->mbs;
1473: VecSet(v,0.0);
1474: VecGetArray(v,&x);
1475: VecGetLocalSize(v,&n);
1476: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1477: for (i=0; i<mbs; i++) {
1478: ncols = ai[1] - ai[0]; ai++;
1479: brow = bs*i;
1480: for (j=0; j<ncols; j++) {
1481: bcol = bs*(*aj);
1482: for (kcol=0; kcol<bs; kcol++) {
1483: col = bcol + kcol; /* col index */
1484: for (krow=0; krow<bs; krow++) {
1485: atmp = PetscAbsScalar(*aa); aa++;
1486: row = brow + krow; /* row index */
1487: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1488: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1489: }
1490: }
1491: aj++;
1492: }
1493: }
1494: VecRestoreArray(v,&x);
1495: return(0);
1496: }
1498: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1499: {
1503: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1504: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1505: return(0);
1506: }
1508: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1509: {
1510: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1511: PetscScalar *z = c;
1512: const PetscScalar *xb;
1513: PetscScalar x1;
1514: const MatScalar *v = a->a,*vv;
1515: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1516: #if defined(PETSC_USE_COMPLEX)
1517: const int aconj = A->hermitian;
1518: #else
1519: const int aconj = 0;
1520: #endif
1523: for (i=0; i<mbs; i++) {
1524: n = ii[1] - ii[0]; ii++;
1525: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1526: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1527: jj = idx;
1528: vv = v;
1529: for (k=0; k<cn; k++) {
1530: idx = jj;
1531: v = vv;
1532: for (j=0; j<n; j++) {
1533: xb = b + (*idx); x1 = xb[0+k*bm];
1534: z[0+k*cm] += v[0]*x1;
1535: if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1536: v += 1;
1537: ++idx;
1538: }
1539: }
1540: z += 1;
1541: }
1542: return(0);
1543: }
1545: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1546: {
1547: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1548: PetscScalar *z = c;
1549: const PetscScalar *xb;
1550: PetscScalar x1,x2;
1551: const MatScalar *v = a->a,*vv;
1552: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1555: for (i=0; i<mbs; i++) {
1556: n = ii[1] - ii[0]; ii++;
1557: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1558: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1559: jj = idx;
1560: vv = v;
1561: for (k=0; k<cn; k++) {
1562: idx = jj;
1563: v = vv;
1564: for (j=0; j<n; j++) {
1565: xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1566: z[0+k*cm] += v[0]*x1 + v[2]*x2;
1567: z[1+k*cm] += v[1]*x1 + v[3]*x2;
1568: if (*idx != i) {
1569: c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1570: c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1571: }
1572: v += 4;
1573: ++idx;
1574: }
1575: }
1576: z += 2;
1577: }
1578: return(0);
1579: }
1581: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1582: {
1583: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1584: PetscScalar *z = c;
1585: const PetscScalar *xb;
1586: PetscScalar x1,x2,x3;
1587: const MatScalar *v = a->a,*vv;
1588: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1591: for (i=0; i<mbs; i++) {
1592: n = ii[1] - ii[0]; ii++;
1593: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1594: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1595: jj = idx;
1596: vv = v;
1597: for (k=0; k<cn; k++) {
1598: idx = jj;
1599: v = vv;
1600: for (j=0; j<n; j++) {
1601: xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1602: z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1603: z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1604: z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1605: if (*idx != i) {
1606: c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1607: c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1608: c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1609: }
1610: v += 9;
1611: ++idx;
1612: }
1613: }
1614: z += 3;
1615: }
1616: return(0);
1617: }
1619: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1620: {
1621: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1622: PetscScalar *z = c;
1623: const PetscScalar *xb;
1624: PetscScalar x1,x2,x3,x4;
1625: const MatScalar *v = a->a,*vv;
1626: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1629: for (i=0; i<mbs; i++) {
1630: n = ii[1] - ii[0]; ii++;
1631: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1632: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1633: jj = idx;
1634: vv = v;
1635: for (k=0; k<cn; k++) {
1636: idx = jj;
1637: v = vv;
1638: for (j=0; j<n; j++) {
1639: xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1640: z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1641: z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1642: z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1643: z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1644: if (*idx != i) {
1645: c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm];
1646: c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm];
1647: c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1648: c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1649: }
1650: v += 16;
1651: ++idx;
1652: }
1653: }
1654: z += 4;
1655: }
1656: return(0);
1657: }
1659: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1660: {
1661: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1662: PetscScalar *z = c;
1663: const PetscScalar *xb;
1664: PetscScalar x1,x2,x3,x4,x5;
1665: const MatScalar *v = a->a,*vv;
1666: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1669: for (i=0; i<mbs; i++) {
1670: n = ii[1] - ii[0]; ii++;
1671: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1672: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1673: jj = idx;
1674: vv = v;
1675: for (k=0; k<cn; k++) {
1676: idx = jj;
1677: v = vv;
1678: for (j=0; j<n; j++) {
1679: xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1680: z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1681: z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1682: z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1683: z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1684: z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1685: if (*idx != i) {
1686: c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1687: c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1688: c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1689: c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1690: c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1691: }
1692: v += 25;
1693: ++idx;
1694: }
1695: }
1696: z += 5;
1697: }
1698: return(0);
1699: }
1701: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1702: {
1703: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1704: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1705: Mat_SeqDense *cd = (Mat_SeqDense*)B->data;
1706: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1707: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1708: PetscBLASInt bbs,bcn,bbm,bcm;
1709: PetscScalar *z = 0;
1710: PetscScalar *c,*b;
1711: const MatScalar *v;
1712: const PetscInt *idx,*ii;
1713: PetscScalar _DOne=1.0;
1714: PetscErrorCode ierr;
1717: if (!cm || !cn) return(0);
1718: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1719: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1720: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1721: b = bd->v;
1722: MatZeroEntries(C);
1723: MatDenseGetArray(C,&c);
1724: switch (bs) {
1725: case 1:
1726: MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1727: break;
1728: case 2:
1729: MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1730: break;
1731: case 3:
1732: MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1733: break;
1734: case 4:
1735: MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1736: break;
1737: case 5:
1738: MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1739: break;
1740: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1741: PetscBLASIntCast(bs,&bbs);
1742: PetscBLASIntCast(cn,&bcn);
1743: PetscBLASIntCast(bm,&bbm);
1744: PetscBLASIntCast(cm,&bcm);
1745: idx = a->j;
1746: v = a->a;
1747: mbs = a->mbs;
1748: ii = a->i;
1749: z = c;
1750: for (i=0; i<mbs; i++) {
1751: n = ii[1] - ii[0]; ii++;
1752: for (j=0; j<n; j++) {
1753: if (*idx != i)
1754: PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1755: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1756: v += bs2;
1757: }
1758: z += bs;
1759: }
1760: }
1761: MatDenseRestoreArray(C,&c);
1762: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1763: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1764: PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);
1765: return(0);
1766: }