Actual source code: sbaij2.c
petsc-3.12.5 2020-03-29
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
12: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13: const PetscInt *idx;
14: PetscBT table_out,table_in;
17: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18: mbs = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = A->rmap->bs;
22: PetscBTCreate(mbs,&table_out);
23: PetscMalloc1(mbs+1,&nidx);
24: PetscMalloc1(A->rmap->N+1,&nidx2);
25: PetscBTCreate(mbs,&table_in);
27: for (i=0; i<is_max; i++) { /* for each is */
28: isz = 0;
29: PetscBTMemzero(mbs,table_out);
31: /* Extract the indices, assume there can be duplicate entries */
32: ISGetIndices(is[i],&idx);
33: ISGetLocalSize(is[i],&n);
35: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36: bcol_max = 0;
37: for (j=0; j<n; ++j) {
38: brow = idx[j]/bs; /* convert the indices into block indices */
39: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40: if (!PetscBTLookupSet(table_out,brow)) {
41: nidx[isz++] = brow;
42: if (bcol_max < brow) bcol_max = brow;
43: }
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap */
50: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51: PetscBTMemzero(mbs,table_in);
52: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
54: n = isz; /* length of the updated is[i] */
55: for (brow=0; brow<mbs; brow++) {
56: start = ai[brow]; end = ai[brow+1];
57: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58: for (l = start; l<end; l++) {
59: bcol = aj[l];
60: if (!PetscBTLookupSet(table_out,bcol)) {
61: nidx[isz++] = bcol;
62: if (bcol_max < bcol) bcol_max = bcol;
63: }
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l<end; l++) {
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table_in,bcol)) {
72: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
80: /* expand the Index Set */
81: for (j=0; j<isz; j++) {
82: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83: }
84: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
85: }
86: PetscBTDestroy(&table_out);
87: PetscFree(nidx);
88: PetscFree(nidx2);
89: PetscBTDestroy(&table_in);
90: return(0);
91: }
93: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94: Zero some ops' to avoid invalid usse */
95: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96: {
100: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
101: Bseq->ops->mult = 0;
102: Bseq->ops->multadd = 0;
103: Bseq->ops->multtranspose = 0;
104: Bseq->ops->multtransposeadd = 0;
105: Bseq->ops->lufactor = 0;
106: Bseq->ops->choleskyfactor = 0;
107: Bseq->ops->lufactorsymbolic = 0;
108: Bseq->ops->choleskyfactorsymbolic = 0;
109: Bseq->ops->getinertia = 0;
110: return(0);
111: }
113: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
114: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115: {
116: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
118: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
120: const PetscInt *irow,*icol;
121: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122: PetscInt *aj = a->j,*ai = a->i;
123: MatScalar *mat_a;
124: Mat C;
125: PetscBool flag;
129: ISGetIndices(isrow,&irow);
130: ISGetIndices(iscol,&icol);
131: ISGetLocalSize(isrow,&nrows);
132: ISGetLocalSize(iscol,&ncols);
134: PetscCalloc1(1+oldcols,&smap);
135: ssmap = smap;
136: PetscMalloc1(1+nrows,&lens);
137: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138: /* determine lens of each row */
139: for (i=0; i<nrows; i++) {
140: kstart = ai[irow[i]];
141: kend = kstart + a->ilen[irow[i]];
142: lens[i] = 0;
143: for (k=kstart; k<kend; k++) {
144: if (ssmap[aj[k]]) lens[i]++;
145: }
146: }
147: /* Create and fill new matrix */
148: if (scall == MAT_REUSE_MATRIX) {
149: c = (Mat_SeqSBAIJ*)((*B)->data);
151: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
153: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154: PetscArrayzero(c->ilen,c->mbs);
155: C = *B;
156: } else {
157: MatCreate(PetscObjectComm((PetscObject)A),&C);
158: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
159: MatSetType(C,((PetscObject)A)->type_name);
160: MatSeqSBAIJSetPreallocation(C,bs,0,lens);
161: }
162: c = (Mat_SeqSBAIJ*)(C->data);
163: for (i=0; i<nrows; i++) {
164: row = irow[i];
165: kstart = ai[row];
166: kend = kstart + a->ilen[row];
167: mat_i = c->i[i];
168: mat_j = c->j + mat_i;
169: mat_a = c->a + mat_i*bs2;
170: mat_ilen = c->ilen + i;
171: for (k=kstart; k<kend; k++) {
172: if ((tcol=ssmap[a->j[k]])) {
173: *mat_j++ = tcol - 1;
174: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
175: mat_a += bs2;
176: (*mat_ilen)++;
177: }
178: }
179: }
180: /* sort */
181: {
182: MatScalar *work;
184: PetscMalloc1(bs2,&work);
185: for (i=0; i<nrows; i++) {
186: PetscInt ilen;
187: mat_i = c->i[i];
188: mat_j = c->j + mat_i;
189: mat_a = c->a + mat_i*bs2;
190: ilen = c->ilen[i];
191: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
192: }
193: PetscFree(work);
194: }
196: /* Free work space */
197: ISRestoreIndices(iscol,&icol);
198: PetscFree(smap);
199: PetscFree(lens);
200: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
201: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
203: ISRestoreIndices(isrow,&irow);
204: *B = C;
205: return(0);
206: }
208: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
209: {
210: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
211: IS is1,is2;
213: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214: const PetscInt *irow,*icol;
217: ISGetIndices(isrow,&irow);
218: ISGetIndices(iscol,&icol);
219: ISGetLocalSize(isrow,&nrows);
220: ISGetLocalSize(iscol,&ncols);
222: /* Verify if the indices corespond to each element in a block
223: and form the IS with compressed IS */
224: maxmnbs = PetscMax(a->mbs,a->nbs);
225: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
226: PetscArrayzero(vary,a->mbs);
227: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228: for (i=0; i<a->mbs; i++) {
229: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230: }
231: count = 0;
232: for (i=0; i<nrows; i++) {
233: PetscInt j = irow[i] / bs;
234: if ((vary[j]--)==bs) iary[count++] = j;
235: }
236: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
238: PetscArrayzero(vary,a->nbs);
239: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240: for (i=0; i<a->nbs; i++) {
241: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242: }
243: count = 0;
244: for (i=0; i<ncols; i++) {
245: PetscInt j = icol[i] / bs;
246: if ((vary[j]--)==bs) iary[count++] = j;
247: }
248: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
249: ISRestoreIndices(isrow,&irow);
250: ISRestoreIndices(iscol,&icol);
251: PetscFree2(vary,iary);
253: MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
254: ISDestroy(&is1);
255: ISDestroy(&is2);
257: if (isrow != iscol) {
258: PetscBool isequal;
259: ISEqual(isrow,iscol,&isequal);
260: if (!isequal) {
261: MatSeqSBAIJZeroOps_Private(*B);
262: }
263: }
264: return(0);
265: }
267: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
268: {
270: PetscInt i;
273: if (scall == MAT_INITIAL_MATRIX) {
274: PetscCalloc1(n+1,B);
275: }
277: for (i=0; i<n; i++) {
278: MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
279: }
280: return(0);
281: }
283: /* -------------------------------------------------------*/
284: /* Should check that shapes of vectors and matrices match */
285: /* -------------------------------------------------------*/
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
290: PetscScalar *z,x1,x2,zero=0.0;
291: const PetscScalar *x,*xb;
292: const MatScalar *v;
293: PetscErrorCode ierr;
294: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
295: const PetscInt *aj=a->j,*ai=a->i,*ib;
296: PetscInt nonzerorow=0;
299: VecSet(zz,zero);
300: if (!a->nz) return(0);
301: VecGetArrayRead(xx,&x);
302: VecGetArray(zz,&z);
304: v = a->a;
305: xb = x;
307: for (i=0; i<mbs; i++) {
308: n = ai[1] - ai[0]; /* length of i_th block row of A */
309: x1 = xb[0]; x2 = xb[1];
310: ib = aj + *ai;
311: jmin = 0;
312: nonzerorow += (n>0);
313: if (*ib == i) { /* (diag of A)*x */
314: z[2*i] += v[0]*x1 + v[2]*x2;
315: z[2*i+1] += v[2]*x1 + v[3]*x2;
316: v += 4; jmin++;
317: }
318: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
319: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
320: for (j=jmin; j<n; j++) {
321: /* (strict lower triangular part of A)*x */
322: cval = ib[j]*2;
323: z[cval] += v[0]*x1 + v[1]*x2;
324: z[cval+1] += v[2]*x1 + v[3]*x2;
325: /* (strict upper triangular part of A)*x */
326: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
327: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
328: v += 4;
329: }
330: xb +=2; ai++;
331: }
333: VecRestoreArrayRead(xx,&x);
334: VecRestoreArray(zz,&z);
335: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
336: return(0);
337: }
339: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
340: {
341: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
342: PetscScalar *z,x1,x2,x3,zero=0.0;
343: const PetscScalar *x,*xb;
344: const MatScalar *v;
345: PetscErrorCode ierr;
346: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
347: const PetscInt *aj = a->j,*ai = a->i,*ib;
348: PetscInt nonzerorow=0;
351: VecSet(zz,zero);
352: if (!a->nz) return(0);
353: VecGetArrayRead(xx,&x);
354: VecGetArray(zz,&z);
356: v = a->a;
357: xb = x;
359: for (i=0; i<mbs; i++) {
360: n = ai[1] - ai[0]; /* length of i_th block row of A */
361: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
362: ib = aj + *ai;
363: jmin = 0;
364: nonzerorow += (n>0);
365: if (*ib == i) { /* (diag of A)*x */
366: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
367: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
368: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
369: v += 9; jmin++;
370: }
371: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
373: for (j=jmin; j<n; j++) {
374: /* (strict lower triangular part of A)*x */
375: cval = ib[j]*3;
376: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
377: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
378: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
379: /* (strict upper triangular part of A)*x */
380: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
381: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
382: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
383: v += 9;
384: }
385: xb +=3; ai++;
386: }
388: VecRestoreArrayRead(xx,&x);
389: VecRestoreArray(zz,&z);
390: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
391: return(0);
392: }
394: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
395: {
396: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
397: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
398: const PetscScalar *x,*xb;
399: const MatScalar *v;
400: PetscErrorCode ierr;
401: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
402: const PetscInt *aj = a->j,*ai = a->i,*ib;
403: PetscInt nonzerorow = 0;
406: VecSet(zz,zero);
407: if (!a->nz) return(0);
408: VecGetArrayRead(xx,&x);
409: VecGetArray(zz,&z);
411: v = a->a;
412: xb = x;
414: for (i=0; i<mbs; i++) {
415: n = ai[1] - ai[0]; /* length of i_th block row of A */
416: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417: ib = aj + *ai;
418: jmin = 0;
419: nonzerorow += (n>0);
420: if (*ib == i) { /* (diag of A)*x */
421: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
422: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
423: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
424: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
425: v += 16; jmin++;
426: }
427: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
428: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429: for (j=jmin; j<n; j++) {
430: /* (strict lower triangular part of A)*x */
431: cval = ib[j]*4;
432: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
433: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
434: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
435: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
436: /* (strict upper triangular part of A)*x */
437: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
438: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
439: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
440: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
441: v += 16;
442: }
443: xb +=4; ai++;
444: }
446: VecRestoreArrayRead(xx,&x);
447: VecRestoreArray(zz,&z);
448: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
449: return(0);
450: }
452: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
453: {
454: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
455: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
456: const PetscScalar *x,*xb;
457: const MatScalar *v;
458: PetscErrorCode ierr;
459: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
460: const PetscInt *aj = a->j,*ai = a->i,*ib;
461: PetscInt nonzerorow=0;
464: VecSet(zz,zero);
465: if (!a->nz) return(0);
466: VecGetArrayRead(xx,&x);
467: VecGetArray(zz,&z);
469: v = a->a;
470: xb = x;
472: for (i=0; i<mbs; i++) {
473: n = ai[1] - ai[0]; /* length of i_th block row of A */
474: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
475: ib = aj + *ai;
476: jmin = 0;
477: nonzerorow += (n>0);
478: if (*ib == i) { /* (diag of A)*x */
479: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
480: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
481: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
482: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
483: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
484: v += 25; jmin++;
485: }
486: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
487: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488: for (j=jmin; j<n; j++) {
489: /* (strict lower triangular part of A)*x */
490: cval = ib[j]*5;
491: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
492: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
493: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
494: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
495: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
496: /* (strict upper triangular part of A)*x */
497: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
498: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
499: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
500: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
501: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
502: v += 25;
503: }
504: xb +=5; ai++;
505: }
507: VecRestoreArrayRead(xx,&x);
508: VecRestoreArray(zz,&z);
509: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
510: return(0);
511: }
514: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
517: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518: const PetscScalar *x,*xb;
519: const MatScalar *v;
520: PetscErrorCode ierr;
521: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
522: const PetscInt *aj=a->j,*ai=a->i,*ib;
523: PetscInt nonzerorow=0;
526: VecSet(zz,zero);
527: if (!a->nz) return(0);
528: VecGetArrayRead(xx,&x);
529: VecGetArray(zz,&z);
531: v = a->a;
532: xb = x;
534: for (i=0; i<mbs; i++) {
535: n = ai[1] - ai[0]; /* length of i_th block row of A */
536: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537: ib = aj + *ai;
538: jmin = 0;
539: nonzerorow += (n>0);
540: if (*ib == i) { /* (diag of A)*x */
541: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547: v += 36; jmin++;
548: }
549: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551: for (j=jmin; j<n; j++) {
552: /* (strict lower triangular part of A)*x */
553: cval = ib[j]*6;
554: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560: /* (strict upper triangular part of A)*x */
561: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567: v += 36;
568: }
569: xb +=6; ai++;
570: }
572: VecRestoreArrayRead(xx,&x);
573: VecRestoreArray(zz,&z);
574: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
575: return(0);
576: }
577: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
578: {
579: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
580: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
581: const PetscScalar *x,*xb;
582: const MatScalar *v;
583: PetscErrorCode ierr;
584: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
585: const PetscInt *aj=a->j,*ai=a->i,*ib;
586: PetscInt nonzerorow=0;
589: VecSet(zz,zero);
590: if (!a->nz) return(0);
591: VecGetArrayRead(xx,&x);
592: VecGetArray(zz,&z);
594: v = a->a;
595: xb = x;
597: for (i=0; i<mbs; i++) {
598: n = ai[1] - ai[0]; /* length of i_th block row of A */
599: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
600: ib = aj + *ai;
601: jmin = 0;
602: nonzerorow += (n>0);
603: if (*ib == i) { /* (diag of A)*x */
604: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
605: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
606: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
607: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
608: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
609: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
610: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
611: v += 49; jmin++;
612: }
613: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
614: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
615: for (j=jmin; j<n; j++) {
616: /* (strict lower triangular part of A)*x */
617: cval = ib[j]*7;
618: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
619: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
620: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
621: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
622: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
623: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
624: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
625: /* (strict upper triangular part of A)*x */
626: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
627: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
628: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
629: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
630: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
631: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
632: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
633: v += 49;
634: }
635: xb +=7; ai++;
636: }
637: VecRestoreArrayRead(xx,&x);
638: VecRestoreArray(zz,&z);
639: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
640: return(0);
641: }
643: /*
644: This will not work with MatScalar == float because it calls the BLAS
645: */
646: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
647: {
648: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
649: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
650: const PetscScalar *x,*x_ptr,*xb;
651: const MatScalar *v;
652: PetscErrorCode ierr;
653: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
654: const PetscInt *idx,*aj,*ii;
655: PetscInt nonzerorow=0;
658: VecSet(zz,zero);
659: if (!a->nz) return(0);
660: VecGetArrayRead(xx,&x);x_ptr = x;
661: VecGetArray(zz,&z); z_ptr=z;
663: aj = a->j;
664: v = a->a;
665: ii = a->i;
667: if (!a->mult_work) {
668: PetscMalloc1(A->rmap->N+1,&a->mult_work);
669: }
670: work = a->mult_work;
672: for (i=0; i<mbs; i++) {
673: n = ii[1] - ii[0]; ncols = n*bs;
674: workt = work; idx=aj+ii[0];
675: nonzerorow += (n>0);
677: /* upper triangular part */
678: for (j=0; j<n; j++) {
679: xb = x_ptr + bs*(*idx++);
680: for (k=0; k<bs; k++) workt[k] = xb[k];
681: workt += bs;
682: }
683: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
684: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
686: /* strict lower triangular part */
687: idx = aj+ii[0];
688: if (*idx == i) {
689: ncols -= bs; v += bs2; idx++; n--;
690: }
692: if (ncols > 0) {
693: workt = work;
694: PetscArrayzero(workt,ncols);
695: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
696: for (j=0; j<n; j++) {
697: zb = z_ptr + bs*(*idx++);
698: for (k=0; k<bs; k++) zb[k] += workt[k];
699: workt += bs;
700: }
701: }
702: x += bs; v += n*bs2; z += bs; ii++;
703: }
705: VecRestoreArrayRead(xx,&x);
706: VecRestoreArray(zz,&z);
707: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
708: return(0);
709: }
711: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
712: {
713: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
714: PetscScalar *z,x1;
715: const PetscScalar *x,*xb;
716: const MatScalar *v;
717: PetscErrorCode ierr;
718: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
719: const PetscInt *aj=a->j,*ai=a->i,*ib;
720: PetscInt nonzerorow=0;
723: VecCopy(yy,zz);
724: if (!a->nz) return(0);
725: VecGetArrayRead(xx,&x);
726: VecGetArray(zz,&z);
727: v = a->a;
728: xb = x;
730: for (i=0; i<mbs; i++) {
731: n = ai[1] - ai[0]; /* length of i_th row of A */
732: x1 = xb[0];
733: ib = aj + *ai;
734: jmin = 0;
735: nonzerorow += (n>0);
736: if (*ib == i) { /* (diag of A)*x */
737: z[i] += *v++ * x[*ib++]; jmin++;
738: }
739: for (j=jmin; j<n; j++) {
740: cval = *ib;
741: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
742: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
743: }
744: xb++; ai++;
745: }
747: VecRestoreArrayRead(xx,&x);
748: VecRestoreArray(zz,&z);
750: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
751: return(0);
752: }
754: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
755: {
756: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
757: PetscScalar *z,x1,x2;
758: const PetscScalar *x,*xb;
759: const MatScalar *v;
760: PetscErrorCode ierr;
761: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
762: const PetscInt *aj=a->j,*ai=a->i,*ib;
763: PetscInt nonzerorow=0;
766: VecCopy(yy,zz);
767: if (!a->nz) return(0);
768: VecGetArrayRead(xx,&x);
769: VecGetArray(zz,&z);
771: v = a->a;
772: xb = x;
774: for (i=0; i<mbs; i++) {
775: n = ai[1] - ai[0]; /* length of i_th block row of A */
776: x1 = xb[0]; x2 = xb[1];
777: ib = aj + *ai;
778: jmin = 0;
779: nonzerorow += (n>0);
780: if (*ib == i) { /* (diag of A)*x */
781: z[2*i] += v[0]*x1 + v[2]*x2;
782: z[2*i+1] += v[2]*x1 + v[3]*x2;
783: v += 4; jmin++;
784: }
785: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
786: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
787: for (j=jmin; j<n; j++) {
788: /* (strict lower triangular part of A)*x */
789: cval = ib[j]*2;
790: z[cval] += v[0]*x1 + v[1]*x2;
791: z[cval+1] += v[2]*x1 + v[3]*x2;
792: /* (strict upper triangular part of A)*x */
793: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
794: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
795: v += 4;
796: }
797: xb +=2; ai++;
798: }
799: VecRestoreArrayRead(xx,&x);
800: VecRestoreArray(zz,&z);
802: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
803: return(0);
804: }
806: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
807: {
808: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
809: PetscScalar *z,x1,x2,x3;
810: const PetscScalar *x,*xb;
811: const MatScalar *v;
812: PetscErrorCode ierr;
813: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
814: const PetscInt *aj=a->j,*ai=a->i,*ib;
815: PetscInt nonzerorow=0;
818: VecCopy(yy,zz);
819: if (!a->nz) return(0);
820: VecGetArrayRead(xx,&x);
821: VecGetArray(zz,&z);
823: v = a->a;
824: xb = x;
826: for (i=0; i<mbs; i++) {
827: n = ai[1] - ai[0]; /* length of i_th block row of A */
828: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
829: ib = aj + *ai;
830: jmin = 0;
831: nonzerorow += (n>0);
832: if (*ib == i) { /* (diag of A)*x */
833: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
834: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
835: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836: v += 9; jmin++;
837: }
838: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
839: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
840: for (j=jmin; j<n; j++) {
841: /* (strict lower triangular part of A)*x */
842: cval = ib[j]*3;
843: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
844: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
845: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
846: /* (strict upper triangular part of A)*x */
847: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
848: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
849: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
850: v += 9;
851: }
852: xb +=3; ai++;
853: }
855: VecRestoreArrayRead(xx,&x);
856: VecRestoreArray(zz,&z);
858: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
859: return(0);
860: }
862: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863: {
864: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
865: PetscScalar *z,x1,x2,x3,x4;
866: const PetscScalar *x,*xb;
867: const MatScalar *v;
868: PetscErrorCode ierr;
869: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
870: const PetscInt *aj=a->j,*ai=a->i,*ib;
871: PetscInt nonzerorow=0;
874: VecCopy(yy,zz);
875: if (!a->nz) return(0);
876: VecGetArrayRead(xx,&x);
877: VecGetArray(zz,&z);
879: v = a->a;
880: xb = x;
882: for (i=0; i<mbs; i++) {
883: n = ai[1] - ai[0]; /* length of i_th block row of A */
884: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885: ib = aj + *ai;
886: jmin = 0;
887: nonzerorow += (n>0);
888: if (*ib == i) { /* (diag of A)*x */
889: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
890: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
891: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
892: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893: v += 16; jmin++;
894: }
895: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897: for (j=jmin; j<n; j++) {
898: /* (strict lower triangular part of A)*x */
899: cval = ib[j]*4;
900: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
901: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
902: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
903: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
904: /* (strict upper triangular part of A)*x */
905: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
906: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
907: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
908: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
909: v += 16;
910: }
911: xb +=4; ai++;
912: }
914: VecRestoreArrayRead(xx,&x);
915: VecRestoreArray(zz,&z);
917: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
918: return(0);
919: }
921: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
922: {
923: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
924: PetscScalar *z,x1,x2,x3,x4,x5;
925: const PetscScalar *x,*xb;
926: const MatScalar *v;
927: PetscErrorCode ierr;
928: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
929: const PetscInt *aj=a->j,*ai=a->i,*ib;
930: PetscInt nonzerorow=0;
933: VecCopy(yy,zz);
934: if (!a->nz) return(0);
935: VecGetArrayRead(xx,&x);
936: VecGetArray(zz,&z);
938: v = a->a;
939: xb = x;
941: for (i=0; i<mbs; i++) {
942: n = ai[1] - ai[0]; /* length of i_th block row of A */
943: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
944: ib = aj + *ai;
945: jmin = 0;
946: nonzerorow += (n>0);
947: if (*ib == i) { /* (diag of A)*x */
948: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
949: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
950: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
951: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
952: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953: v += 25; jmin++;
954: }
955: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
956: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
957: for (j=jmin; j<n; j++) {
958: /* (strict lower triangular part of A)*x */
959: cval = ib[j]*5;
960: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
961: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
962: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
963: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
964: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
965: /* (strict upper triangular part of A)*x */
966: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
967: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
968: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
969: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
970: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
971: v += 25;
972: }
973: xb +=5; ai++;
974: }
976: VecRestoreArrayRead(xx,&x);
977: VecRestoreArray(zz,&z);
979: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
980: return(0);
981: }
982: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
983: {
984: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
985: PetscScalar *z,x1,x2,x3,x4,x5,x6;
986: const PetscScalar *x,*xb;
987: const MatScalar *v;
988: PetscErrorCode ierr;
989: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
990: const PetscInt *aj=a->j,*ai=a->i,*ib;
991: PetscInt nonzerorow=0;
994: VecCopy(yy,zz);
995: if (!a->nz) return(0);
996: VecGetArrayRead(xx,&x);
997: VecGetArray(zz,&z);
999: v = a->a;
1000: xb = x;
1002: for (i=0; i<mbs; i++) {
1003: n = ai[1] - ai[0]; /* length of i_th block row of A */
1004: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1005: ib = aj + *ai;
1006: jmin = 0;
1007: nonzerorow += (n>0);
1008: if (*ib == i) { /* (diag of A)*x */
1009: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1010: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1011: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1012: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1013: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1014: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015: v += 36; jmin++;
1016: }
1017: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019: for (j=jmin; j<n; j++) {
1020: /* (strict lower triangular part of A)*x */
1021: cval = ib[j]*6;
1022: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1023: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1024: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1025: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1026: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1027: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1028: /* (strict upper triangular part of A)*x */
1029: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1030: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1031: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1032: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1033: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1034: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1035: v += 36;
1036: }
1037: xb +=6; ai++;
1038: }
1040: VecRestoreArrayRead(xx,&x);
1041: VecRestoreArray(zz,&z);
1043: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1044: return(0);
1045: }
1047: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1048: {
1049: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1050: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1051: const PetscScalar *x,*xb;
1052: const MatScalar *v;
1053: PetscErrorCode ierr;
1054: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1055: const PetscInt *aj=a->j,*ai=a->i,*ib;
1056: PetscInt nonzerorow=0;
1059: VecCopy(yy,zz);
1060: if (!a->nz) return(0);
1061: VecGetArrayRead(xx,&x);
1062: VecGetArray(zz,&z);
1064: v = a->a;
1065: xb = x;
1067: for (i=0; i<mbs; i++) {
1068: n = ai[1] - ai[0]; /* length of i_th block row of A */
1069: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1070: ib = aj + *ai;
1071: jmin = 0;
1072: nonzerorow += (n>0);
1073: if (*ib == i) { /* (diag of A)*x */
1074: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1075: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1076: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1077: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1078: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1079: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1080: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081: v += 49; jmin++;
1082: }
1083: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1084: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085: for (j=jmin; j<n; j++) {
1086: /* (strict lower triangular part of A)*x */
1087: cval = ib[j]*7;
1088: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1089: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1090: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1091: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1092: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1093: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1094: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1095: /* (strict upper triangular part of A)*x */
1096: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1097: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1098: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1099: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1100: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1101: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1102: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1103: v += 49;
1104: }
1105: xb +=7; ai++;
1106: }
1108: VecRestoreArrayRead(xx,&x);
1109: VecRestoreArray(zz,&z);
1111: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1112: return(0);
1113: }
1115: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1116: {
1117: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1118: PetscScalar *z,*z_ptr=0,*zb,*work,*workt;
1119: const PetscScalar *x,*x_ptr,*xb;
1120: const MatScalar *v;
1121: PetscErrorCode ierr;
1122: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1123: const PetscInt *idx,*aj,*ii;
1124: PetscInt nonzerorow=0;
1127: VecCopy(yy,zz);
1128: if (!a->nz) return(0);
1129: VecGetArrayRead(xx,&x); x_ptr=x;
1130: VecGetArray(zz,&z); z_ptr=z;
1132: aj = a->j;
1133: v = a->a;
1134: ii = a->i;
1136: if (!a->mult_work) {
1137: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1138: }
1139: work = a->mult_work;
1142: for (i=0; i<mbs; i++) {
1143: n = ii[1] - ii[0]; ncols = n*bs;
1144: workt = work; idx=aj+ii[0];
1145: nonzerorow += (n>0);
1147: /* upper triangular part */
1148: for (j=0; j<n; j++) {
1149: xb = x_ptr + bs*(*idx++);
1150: for (k=0; k<bs; k++) workt[k] = xb[k];
1151: workt += bs;
1152: }
1153: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1154: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1156: /* strict lower triangular part */
1157: idx = aj+ii[0];
1158: if (*idx == i) {
1159: ncols -= bs; v += bs2; idx++; n--;
1160: }
1161: if (ncols > 0) {
1162: workt = work;
1163: PetscArrayzero(workt,ncols);
1164: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1165: for (j=0; j<n; j++) {
1166: zb = z_ptr + bs*(*idx++);
1167: for (k=0; k<bs; k++) zb[k] += workt[k];
1168: workt += bs;
1169: }
1170: }
1172: x += bs; v += n*bs2; z += bs; ii++;
1173: }
1175: VecRestoreArrayRead(xx,&x);
1176: VecRestoreArray(zz,&z);
1178: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1179: return(0);
1180: }
1182: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1183: {
1184: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1185: PetscScalar oalpha = alpha;
1187: PetscBLASInt one = 1,totalnz;
1190: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1191: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1192: PetscLogFlops(totalnz);
1193: return(0);
1194: }
1196: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1197: {
1198: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1199: const MatScalar *v = a->a;
1200: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1201: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1202: PetscErrorCode ierr;
1203: const PetscInt *aj=a->j,*col;
1206: if (!a->nz) {
1207: *norm = 0.0;
1208: return(0);
1209: }
1210: if (type == NORM_FROBENIUS) {
1211: for (k=0; k<mbs; k++) {
1212: jmin = a->i[k]; jmax = a->i[k+1];
1213: col = aj + jmin;
1214: if (*col == k) { /* diagonal block */
1215: for (i=0; i<bs2; i++) {
1216: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1217: }
1218: jmin++;
1219: }
1220: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1221: for (i=0; i<bs2; i++) {
1222: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1223: }
1224: }
1225: }
1226: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1227: PetscLogFlops(2*bs2*a->nz);
1228: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1229: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1230: for (i=0; i<mbs; i++) jl[i] = mbs;
1231: il[0] = 0;
1233: *norm = 0.0;
1234: for (k=0; k<mbs; k++) { /* k_th block row */
1235: for (j=0; j<bs; j++) sum[j]=0.0;
1236: /*-- col sum --*/
1237: i = jl[k]; /* first |A(i,k)| to be added */
1238: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1239: at step k */
1240: while (i<mbs) {
1241: nexti = jl[i]; /* next block row to be added */
1242: ik = il[i]; /* block index of A(i,k) in the array a */
1243: for (j=0; j<bs; j++) {
1244: v = a->a + ik*bs2 + j*bs;
1245: for (k1=0; k1<bs; k1++) {
1246: sum[j] += PetscAbsScalar(*v); v++;
1247: }
1248: }
1249: /* update il, jl */
1250: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1251: jmax = a->i[i+1];
1252: if (jmin < jmax) {
1253: il[i] = jmin;
1254: j = a->j[jmin];
1255: jl[i] = jl[j]; jl[j]=i;
1256: }
1257: i = nexti;
1258: }
1259: /*-- row sum --*/
1260: jmin = a->i[k]; jmax = a->i[k+1];
1261: for (i=jmin; i<jmax; i++) {
1262: for (j=0; j<bs; j++) {
1263: v = a->a + i*bs2 + j;
1264: for (k1=0; k1<bs; k1++) {
1265: sum[j] += PetscAbsScalar(*v); v += bs;
1266: }
1267: }
1268: }
1269: /* add k_th block row to il, jl */
1270: col = aj+jmin;
1271: if (*col == k) jmin++;
1272: if (jmin < jmax) {
1273: il[k] = jmin;
1274: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1275: }
1276: for (j=0; j<bs; j++) {
1277: if (sum[j] > *norm) *norm = sum[j];
1278: }
1279: }
1280: PetscFree3(sum,il,jl);
1281: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1282: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1283: return(0);
1284: }
1286: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1287: {
1288: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1292: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1293: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1294: *flg = PETSC_FALSE;
1295: return(0);
1296: }
1298: /* if the a->i are the same */
1299: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1300: if (!*flg) return(0);
1302: /* if a->j are the same */
1303: PetscArraycmp(a->j,b->j,a->nz,flg);
1304: if (!*flg) return(0);
1306: /* if a->a are the same */
1307: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1308: return(0);
1309: }
1311: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1312: {
1313: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1314: PetscErrorCode ierr;
1315: PetscInt i,j,k,row,bs,ambs,bs2;
1316: const PetscInt *ai,*aj;
1317: PetscScalar *x,zero = 0.0;
1318: const MatScalar *aa,*aa_j;
1321: bs = A->rmap->bs;
1322: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1324: aa = a->a;
1325: ambs = a->mbs;
1327: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1328: PetscInt *diag=a->diag;
1329: aa = a->a;
1330: ambs = a->mbs;
1331: VecGetArray(v,&x);
1332: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1333: VecRestoreArray(v,&x);
1334: return(0);
1335: }
1337: ai = a->i;
1338: aj = a->j;
1339: bs2 = a->bs2;
1340: VecSet(v,zero);
1341: if (!a->nz) return(0);
1342: VecGetArray(v,&x);
1343: for (i=0; i<ambs; i++) {
1344: j=ai[i];
1345: if (aj[j] == i) { /* if this is a diagonal element */
1346: row = i*bs;
1347: aa_j = aa + j*bs2;
1348: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1349: }
1350: }
1351: VecRestoreArray(v,&x);
1352: return(0);
1353: }
1355: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1356: {
1357: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1358: PetscScalar x;
1359: const PetscScalar *l,*li,*ri;
1360: MatScalar *aa,*v;
1361: PetscErrorCode ierr;
1362: PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1363: const PetscInt *ai,*aj;
1364: PetscBool flg;
1367: if (ll != rr) {
1368: VecEqual(ll,rr,&flg);
1369: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1370: }
1371: if (!ll) return(0);
1372: ai = a->i;
1373: aj = a->j;
1374: aa = a->a;
1375: m = A->rmap->N;
1376: bs = A->rmap->bs;
1377: mbs = a->mbs;
1378: bs2 = a->bs2;
1380: VecGetArrayRead(ll,&l);
1381: VecGetLocalSize(ll,&lm);
1382: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1383: for (i=0; i<mbs; i++) { /* for each block row */
1384: M = ai[i+1] - ai[i];
1385: li = l + i*bs;
1386: v = aa + bs2*ai[i];
1387: for (j=0; j<M; j++) { /* for each block */
1388: ri = l + bs*aj[ai[i]+j];
1389: for (k=0; k<bs; k++) {
1390: x = ri[k];
1391: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1392: }
1393: }
1394: }
1395: VecRestoreArrayRead(ll,&l);
1396: PetscLogFlops(2.0*a->nz);
1397: return(0);
1398: }
1400: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1401: {
1402: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1405: info->block_size = a->bs2;
1406: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1407: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1408: info->nz_unneeded = info->nz_allocated - info->nz_used;
1409: info->assemblies = A->num_ass;
1410: info->mallocs = A->info.mallocs;
1411: info->memory = ((PetscObject)A)->mem;
1412: if (A->factortype) {
1413: info->fill_ratio_given = A->info.fill_ratio_given;
1414: info->fill_ratio_needed = A->info.fill_ratio_needed;
1415: info->factor_mallocs = A->info.factor_mallocs;
1416: } else {
1417: info->fill_ratio_given = 0;
1418: info->fill_ratio_needed = 0;
1419: info->factor_mallocs = 0;
1420: }
1421: return(0);
1422: }
1425: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1426: {
1427: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1431: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1432: return(0);
1433: }
1435: /*
1436: This code does not work since it only checks the upper triangular part of
1437: the matrix. Hence it is not listed in the function table.
1438: */
1439: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1440: {
1441: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1442: PetscErrorCode ierr;
1443: PetscInt i,j,n,row,col,bs,mbs;
1444: const PetscInt *ai,*aj;
1445: PetscReal atmp;
1446: const MatScalar *aa;
1447: PetscScalar *x;
1448: PetscInt ncols,brow,bcol,krow,kcol;
1451: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1452: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1453: bs = A->rmap->bs;
1454: aa = a->a;
1455: ai = a->i;
1456: aj = a->j;
1457: mbs = a->mbs;
1459: VecSet(v,0.0);
1460: VecGetArray(v,&x);
1461: VecGetLocalSize(v,&n);
1462: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1463: for (i=0; i<mbs; i++) {
1464: ncols = ai[1] - ai[0]; ai++;
1465: brow = bs*i;
1466: for (j=0; j<ncols; j++) {
1467: bcol = bs*(*aj);
1468: for (kcol=0; kcol<bs; kcol++) {
1469: col = bcol + kcol; /* col index */
1470: for (krow=0; krow<bs; krow++) {
1471: atmp = PetscAbsScalar(*aa); aa++;
1472: row = brow + krow; /* row index */
1473: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1474: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1475: }
1476: }
1477: aj++;
1478: }
1479: }
1480: VecRestoreArray(v,&x);
1481: return(0);
1482: }