Actual source code: bstream.c
petsc-3.6.1 2015-08-06
1: #define PETSCMAT_DLL
3: #include <../src/mat/impls/baij/seq/baij.h>
4: #include <../src/mat/impls/baij/seq/bstream/bstream.h>
8: PetscErrorCode MatDestroy_SeqBSTRM(Mat A)
9: {
11: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*) A->spptr;
13: if (bstrm) {
14: PetscFree(bstrm->as);
15: }
16: PetscObjectChangeTypeName((PetscObject)A, MATSEQBAIJ);
17: MatDestroy_SeqBAIJ(A);
18: return(0);
19: }
20: /*=========================================================*/
21: PetscErrorCode MatDuplicate_SeqBSTRM(Mat A, MatDuplicateOption op, Mat *M)
22: {
24: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Cannot duplicate STRM matrices yet");
25: return(0);
26: }
27: /*=========================================================*/
28: extern PetscErrorCode MatSolve_SeqBSTRM_4(Mat A,Vec bb,Vec xx);
29: extern PetscErrorCode MatSolve_SeqBSTRM_5(Mat A,Vec bb,Vec xx);
32: /*=========================================================*/
35: PetscErrorCode MatSeqBSTRM_convert_bstrm(Mat A)
36: {
37: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*) A->data;
38: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*) A->spptr;
39: PetscInt m = a->mbs, bs = A->rmap->bs;
40: PetscInt *ai = a->i, *diag = a->diag;
41: PetscInt i,j,ib,jb;
42: MatScalar *aa = a->a;
44: PetscInt bs2, rbs, cbs, blen, slen;
45: PetscScalar **asp;
48: bstrm->rbs = bs;
49: bstrm->cbs = bs;
52: rbs = cbs = bs;
53: bs2 = rbs*cbs;
54: blen = ai[m]-ai[0]+diag[0] - diag[m];
55: slen = blen*cbs;
57: PetscFree(bstrm->as);
58: PetscMalloc1(bs2*blen, &bstrm->as);
60: PetscMalloc1(rbs, &asp);
62: for (i=0; i<rbs; i++) asp[i] = bstrm->as + i*slen;
64: for (j=0;j<blen;j++) {
65: for (jb=0; jb<cbs; jb++) {
66: for (ib=0; ib<rbs; ib++) asp[ib][j*cbs+jb] = aa[j*bs2+jb*rbs+ib];
67: }
68: }
70: PetscFree(asp);
71: switch (bs) {
72: case 4:
73: A->ops->solve = MatSolve_SeqBSTRM_4;
74: break;
75: case 5:
76: A->ops->solve = MatSolve_SeqBSTRM_5;
77: break;
78: default:
79: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
80: }
81: return(0);
82: }
84: /*=========================================================*/
85: extern PetscErrorCode MatSeqBSTRM_create_bstrm(Mat);
89: PetscErrorCode MatAssemblyEnd_SeqBSTRM(Mat A, MatAssemblyType mode)
90: {
94: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
95: MatAssemblyEnd_SeqBAIJ(A, mode);
97: MatSeqBSTRM_create_bstrm(A);
98: return(0);
99: }
100: /*=========================================================*/
103: PETSC_EXTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
104: {
106: Mat B = *newmat;
107: Mat_SeqBSTRM *bstrm;
110: if (reuse == MAT_INITIAL_MATRIX) {
111: MatDuplicate(A,MAT_COPY_VALUES,&B);
112: }
115: PetscNewLog(B,&bstrm);
116: B->spptr = (void*) bstrm;
118: /* Set function pointers for methods that we inherit from BAIJ but override. */
119: B->ops->duplicate = MatDuplicate_SeqBSTRM;
120: B->ops->assemblyend = MatAssemblyEnd_SeqBSTRM;
121: B->ops->destroy = MatDestroy_SeqBSTRM;
123: /* If A has already been assembled, compute the permutation. */
124: if (A->assembled) {
125: MatSeqBSTRM_create_bstrm(B);
126: }
127: PetscObjectChangeTypeName((PetscObject)B,MATSEQBSTRM);
128: *newmat = B;
129: return(0);
130: }
132: /*=========================================================*/
135: PetscErrorCode MatCreateSeqBSTRM(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
136: {
140: MatCreate(comm,A);
141: MatSetSizes(*A,m,n,m,n);
142: MatSetType(*A,MATSEQBSTRM);
143: MatSeqBAIJSetPreallocation_SeqBAIJ(*A,bs,nz,(PetscInt*)nnz);
144: (*A)->rmap->bs = bs;
145: return(0);
146: }
147: /*=========================================================*/
151: PETSC_EXTERN PetscErrorCode MatCreate_SeqBSTRM(Mat A)
152: {
156: MatSetType(A,MATSEQBAIJ);
157: MatConvert_SeqBAIJ_SeqBSTRM(A,MATSEQBSTRM,MAT_REUSE_MATRIX,&A);
159: PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_seqbstrm_C",MatConvert_SeqBAIJ_SeqBSTRM);
160: return(0);
161: }
163: /*=========================================================*/
165: /*=========================================================*/
168: PetscErrorCode MatSOR_SeqBSTRM_4(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
169: {
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171: PetscScalar *x,x1,x2,x3,x4,s1,s2,s3,s4;
172: const MatScalar *idiag,*mdiag;
173: const PetscScalar *b;
174: PetscErrorCode ierr;
175: PetscInt m = a->mbs,i,i2,nz,idx;
176: const PetscInt *diag,*ai = a->i,*aj = a->j,*vi;
178: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
179: MatScalar *v1,*v2,*v3,*v4, *v10,*v20,*v30,*v40;
180: PetscInt slen;
183: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
184: its = its*lits;
185: if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
186: if (fshift) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for diagonal shift");
187: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for non-trivial relaxation factor");
188: if ((flag & SOR_APPLY_UPPER) || (flag & SOR_APPLY_LOWER)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for applying upper or lower triangular parts");
189: if (its > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support yet for multiple point block SOR iterations");
191: if (!a->idiagvalid) {MatInvertBlockDiagonal(A,NULL);}
193: diag = a->diag;
194: idiag = a->idiag;
195: VecGetArray(xx,&x);
196: VecGetArray(bb,(PetscScalar**)&b);
198: slen = 4*(ai[m]-ai[0]);
199: v10 = bstrm->as;
200: v20 = v10 + slen;
201: v30 = v20 + slen;
202: v40 = v30 + slen;
204: if (flag & SOR_ZERO_INITIAL_GUESS) {
205: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
206: x[0] = b[0]*idiag[0] + b[1]*idiag[4] + b[2]*idiag[8] + b[3]*idiag[12];
207: x[1] = b[0]*idiag[1] + b[1]*idiag[5] + b[2]*idiag[9] + b[3]*idiag[13];
208: x[2] = b[0]*idiag[2] + b[1]*idiag[6] + b[2]*idiag[10] + b[3]*idiag[14];
209: x[3] = b[0]*idiag[3] + b[1]*idiag[7] + b[2]*idiag[11] + b[3]*idiag[15];
210: i2 = 4;
211: idiag += 16;
212: for (i=1; i<m; i++) {
213: v1 = v10 + 4*ai[i];
214: v2 = v20 + 4*ai[i];
215: v3 = v30 + 4*ai[i];
216: v4 = v40 + 4*ai[i];
217: vi = aj + ai[i];
218: nz = diag[i] - ai[i];
219: s1 = b[i2]; s2 = b[i2+1]; s3 = b[i2+2]; s4 = b[i2+3];
220: while (nz--) {
221: idx = 4*(*vi++);
222: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
223: s1 -= v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
224: s2 -= v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
225: s3 -= v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
226: s4 -= v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
227: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
228: }
229: x[i2] = idiag[0]*s1 + idiag[4]*s2 + idiag[8]*s3 + idiag[12]*s4;
230: x[i2+1] = idiag[1]*s1 + idiag[5]*s2 + idiag[9]*s3 + idiag[13]*s4;
231: x[i2+2] = idiag[2]*s1 + idiag[6]*s2 + idiag[10]*s3 + idiag[14]*s4;
232: x[i2+3] = idiag[3]*s1 + idiag[7]*s2 + idiag[11]*s3 + idiag[15]*s4;
233: idiag += 16;
234: i2 += 4;
235: }
236: /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
237: PetscLogFlops(16.0*(a->nz));
238: }
239: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
240: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
241: i2 = 0;
242: mdiag = a->idiag+16*a->mbs;
243: for (i=0; i<m; i++) {
244: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3];
245: x[i2] = mdiag[0]*x1 + mdiag[4]*x2 + mdiag[8]*x3 + mdiag[12]*x4;
246: x[i2+1] = mdiag[1]*x1 + mdiag[5]*x2 + mdiag[9]*x3 + mdiag[13]*x4;
247: x[i2+2] = mdiag[2]*x1 + mdiag[6]*x2 + mdiag[10]*x3 + mdiag[14]*x4;
248: x[i2+3] = mdiag[3]*x1 + mdiag[7]*x2 + mdiag[11]*x3 + mdiag[15]*x4;
249: mdiag += 16;
250: i2 += 4;
251: }
252: PetscLogFlops(28.0*m);
253: } else if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
254: PetscMemcpy(x,b,A->rmap->N*sizeof(PetscScalar));
255: }
256: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
257: idiag = a->idiag+16*a->mbs - 16;
258: i2 = 4*m - 4;
259: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3];
260: x[i2] = idiag[0]*x1 + idiag[4]*x2 + idiag[8]*x3 + idiag[12]*x4;
261: x[i2+1] = idiag[1]*x1 + idiag[5]*x2 + idiag[9]*x3 + idiag[13]*x4;
262: x[i2+2] = idiag[2]*x1 + idiag[6]*x2 + idiag[10]*x3 + idiag[14]*x4;
263: x[i2+3] = idiag[3]*x1 + idiag[7]*x2 + idiag[11]*x3 + idiag[15]*x4;
264: idiag -= 16;
265: i2 -= 4;
266: for (i=m-2; i>=0; i--) {
267: v1 = v10 + 4*(ai[i+1]-1);
268: v2 = v20 + 4*(ai[i+1]-1);
269: v3 = v30 + 4*(ai[i+1]-1);
270: v4 = v40 + 4*(ai[i+1]-1);
271: vi = aj + ai[i+1] - 1;
272: nz = ai[i+1] - diag[i] - 1;
273: s1 = x[i2]; s2 = x[i2+1]; s3 = x[i2+2]; s4 = x[i2+3];
274: while (nz--) {
275: idx = 4*(*vi--);
276: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
277: s1 -= v1[3]*x4 + v1[2]*x3 + v1[1]*x2 + v1[0]*x1;
278: s2 -= v2[3]*x4 + v2[2]*x3 + v2[1]*x2 + v2[0]*x1;
279: s3 -= v3[3]*x4 + v3[2]*x3 + v3[1]*x2 + v3[0]*x1;
280: s4 -= v4[3]*x4 + v4[2]*x3 + v4[1]*x2 + v4[0]*x1;
281: v1 -= 4; v2 -= 4; v3 -= 4; v4 -= 4;
282: }
283: x[i2] = idiag[0]*s1 + idiag[4]*s2 + idiag[8]*s3 + idiag[12]*s4;
284: x[i2+1] = idiag[1]*s1 + idiag[5]*s2 + idiag[9]*s3 + idiag[13]*s4;
285: x[i2+2] = idiag[2]*s1 + idiag[6]*s2 + idiag[10]*s3 + idiag[14]*s4;
286: x[i2+3] = idiag[3]*s1 + idiag[7]*s2 + idiag[11]*s3 + idiag[15]*s4;
287: idiag -= 16;
288: i2 -= 4;
289: }
290: PetscLogFlops(16.0*(a->nz));
291: }
292: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only supports point block SOR with zero initial guess");
293: VecRestoreArray(xx,&x);
294: VecRestoreArray(bb,(PetscScalar**)&b);
295: return(0);
296: }
297: /*=========================================================*/
298: /*=========================================================*/
301: PetscErrorCode MatSOR_SeqBSTRM_5(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
302: {
303: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
304: PetscScalar *x,x1,x2,x3,x4,x5,s1,s2,s3,s4,s5;
305: const MatScalar *idiag,*mdiag;
306: const PetscScalar *b;
307: PetscErrorCode ierr;
308: PetscInt m = a->mbs,i,i2,nz,idx;
309: const PetscInt *diag,*ai = a->i,*aj = a->j,*vi;
311: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
312: MatScalar *v1, *v2, *v3, *v4, *v5, *v10, *v20, *v30, *v40, *v50;
313: PetscInt slen;
316: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
317: its = its*lits;
318: if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
319: if (fshift) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for diagonal shift");
320: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for non-trivial relaxation factor");
321: if ((flag & SOR_APPLY_UPPER) || (flag & SOR_APPLY_LOWER)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for applying upper or lower triangular parts");
322: if (its > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support yet for multiple point block SOR iterations");
324: if (!a->idiagvalid) {MatInvertBlockDiagonal(A,NULL);}
326: diag = a->diag;
327: idiag = a->idiag;
328: VecGetArray(xx,&x);
329: VecGetArray(bb,(PetscScalar**)&b);
331: slen = 5*(ai[m]-ai[0]);
332: v10 = bstrm->as;
333: v20 = v10 + slen;
334: v30 = v20 + slen;
335: v40 = v30 + slen;
336: v50 = v40 + slen;
338: if (flag & SOR_ZERO_INITIAL_GUESS) {
339: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
340: x[0] = b[0]*idiag[0] + b[1]*idiag[5] + b[2]*idiag[10] + b[3]*idiag[15] + b[4]*idiag[20];
341: x[1] = b[0]*idiag[1] + b[1]*idiag[6] + b[2]*idiag[11] + b[3]*idiag[16] + b[4]*idiag[21];
342: x[2] = b[0]*idiag[2] + b[1]*idiag[7] + b[2]*idiag[12] + b[3]*idiag[17] + b[4]*idiag[22];
343: x[3] = b[0]*idiag[3] + b[1]*idiag[8] + b[2]*idiag[13] + b[3]*idiag[18] + b[4]*idiag[23];
344: x[4] = b[0]*idiag[4] + b[1]*idiag[9] + b[2]*idiag[14] + b[3]*idiag[19] + b[4]*idiag[24];
345: i2 = 5;
346: idiag += 25;
347: for (i=1; i<m; i++) {
348: v1 = v10 + 5*ai[i];
349: v2 = v20 + 5*ai[i];
350: v3 = v30 + 5*ai[i];
351: v4 = v40 + 5*ai[i];
352: v5 = v50 + 5*ai[i];
353: vi = aj + ai[i];
354: nz = diag[i] - ai[i];
355: s1 = b[i2]; s2 = b[i2+1]; s3 = b[i2+2]; s4 = b[i2+3]; s5 = b[i2+4];
356: while (nz--) {
357: idx = 5*(*vi++);
358: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
359: s1 -= v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
360: s2 -= v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
361: s3 -= v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
362: s4 -= v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
363: s5 -= v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
364: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
365: }
366: x[i2] = idiag[0]*s1 + idiag[5]*s2 + idiag[10]*s3 + idiag[15]*s4 + idiag[20]*s5;
367: x[i2+1] = idiag[1]*s1 + idiag[6]*s2 + idiag[11]*s3 + idiag[16]*s4 + idiag[21]*s5;
368: x[i2+2] = idiag[2]*s1 + idiag[7]*s2 + idiag[12]*s3 + idiag[17]*s4 + idiag[22]*s5;
369: x[i2+3] = idiag[3]*s1 + idiag[8]*s2 + idiag[13]*s3 + idiag[18]*s4 + idiag[23]*s5;
370: x[i2+4] = idiag[4]*s1 + idiag[9]*s2 + idiag[14]*s3 + idiag[19]*s4 + idiag[24]*s5;
371: idiag += 25;
372: i2 += 5;
373: }
374: /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
375: PetscLogFlops(25.0*(a->nz));
376: }
377: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
378: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
379: i2 = 0;
380: mdiag = a->idiag+25*a->mbs;
381: for (i=0; i<m; i++) {
382: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3]; x5 = x[i2+4];
383: x[i2] = mdiag[0]*x1 + mdiag[5]*x2 + mdiag[10]*x3 + mdiag[15]*x4 + mdiag[20]*x5;
384: x[i2+1] = mdiag[1]*x1 + mdiag[6]*x2 + mdiag[11]*x3 + mdiag[16]*x4 + mdiag[21]*x5;
385: x[i2+2] = mdiag[2]*x1 + mdiag[7]*x2 + mdiag[12]*x3 + mdiag[17]*x4 + mdiag[22]*x5;
386: x[i2+3] = mdiag[3]*x1 + mdiag[8]*x2 + mdiag[13]*x3 + mdiag[18]*x4 + mdiag[23]*x5;
387: x[i2+4] = mdiag[4]*x1 + mdiag[9]*x2 + mdiag[14]*x3 + mdiag[19]*x4 + mdiag[24]*x5;
388: mdiag += 25;
389: i2 += 5;
390: }
391: PetscLogFlops(45.0*m);
392: } else if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
393: PetscMemcpy(x,b,A->rmap->N*sizeof(PetscScalar));
394: }
395: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
396: idiag = a->idiag+25*a->mbs - 25;
397: i2 = 5*m - 5;
398: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3]; x5 = x[i2+4];
399: x[i2] = idiag[0]*x1 + idiag[5]*x2 + idiag[10]*x3 + idiag[15]*x4 + idiag[20]*x5;
400: x[i2+1] = idiag[1]*x1 + idiag[6]*x2 + idiag[11]*x3 + idiag[16]*x4 + idiag[21]*x5;
401: x[i2+2] = idiag[2]*x1 + idiag[7]*x2 + idiag[12]*x3 + idiag[17]*x4 + idiag[22]*x5;
402: x[i2+3] = idiag[3]*x1 + idiag[8]*x2 + idiag[13]*x3 + idiag[18]*x4 + idiag[23]*x5;
403: x[i2+4] = idiag[4]*x1 + idiag[9]*x2 + idiag[14]*x3 + idiag[19]*x4 + idiag[24]*x5;
404: idiag -= 25;
405: i2 -= 5;
406: for (i=m-2; i>=0; i--) {
407: v1 = v10 + 5*(ai[i+1]-1);
408: v2 = v20 + 5*(ai[i+1]-1);
409: v3 = v30 + 5*(ai[i+1]-1);
410: v4 = v40 + 5*(ai[i+1]-1);
411: v5 = v50 + 5*(ai[i+1]-1);
412: vi = aj + ai[i+1] - 1;
413: nz = ai[i+1] - diag[i] - 1;
414: s1 = x[i2]; s2 = x[i2+1]; s3 = x[i2+2]; s4 = x[i2+3]; s5 = x[i2+4];
415: while (nz--) {
416: idx = 5*(*vi--);
417: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
418: s1 -= v1[4]*x5 + v1[3]*x4 + v1[2]*x3 + v1[1]*x2 + v1[0]*x1;
419: s2 -= v2[4]*x5 + v2[3]*x4 + v2[2]*x3 + v2[1]*x2 + v2[0]*x1;
420: s3 -= v3[4]*x5 + v3[3]*x4 + v3[2]*x3 + v3[1]*x2 + v3[0]*x1;
421: s4 -= v4[4]*x5 + v4[3]*x4 + v4[2]*x3 + v4[1]*x2 + v4[0]*x1;
422: s5 -= v5[4]*x5 + v5[3]*x4 + v5[2]*x3 + v5[1]*x2 + v5[0]*x1;
423: v1 -= 5; v2 -= 5; v3 -= 5; v4 -= 5; v5 -= 5;
424: }
425: x[i2] = idiag[0]*s1 + idiag[5]*s2 + idiag[10]*s3 + idiag[15]*s4 + idiag[20]*s5;
426: x[i2+1] = idiag[1]*s1 + idiag[6]*s2 + idiag[11]*s3 + idiag[16]*s4 + idiag[21]*s5;
427: x[i2+2] = idiag[2]*s1 + idiag[7]*s2 + idiag[12]*s3 + idiag[17]*s4 + idiag[22]*s5;
428: x[i2+3] = idiag[3]*s1 + idiag[8]*s2 + idiag[13]*s3 + idiag[18]*s4 + idiag[23]*s5;
429: x[i2+4] = idiag[4]*s1 + idiag[9]*s2 + idiag[14]*s3 + idiag[19]*s4 + idiag[24]*s5;
430: idiag -= 25;
431: i2 -= 5;
432: }
433: PetscLogFlops(25.0*(a->nz));
434: }
435: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only supports point block SOR with zero initial guess");
436: VecRestoreArray(xx,&x);
437: VecRestoreArray(bb,(PetscScalar**)&b);
438: return(0);
439: }
440: /*=========================================================*/
441: /*=========================================================*/
444: PetscErrorCode MatMult_SeqBSTRM_4(Mat A,Vec xx,Vec zz)
445: {
446: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
447: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
448: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
449: const PetscScalar *x,*xb;
450: const MatScalar *v1, *v2, *v3, *v4;
451: PetscErrorCode ierr;
452: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
453: PetscBool usecprow=a->compressedrow.use;
454: PetscInt slen;
457: VecGetArray(xx,(PetscScalar**)&x);
458: VecGetArray(zz,&zarray);
460: idx = a->j;
462: if (usecprow) {
463: mbs = a->compressedrow.nrows;
464: ii = a->compressedrow.i;
465: ridx = a->compressedrow.rindex;
466: } else {
467: mbs = a->mbs;
468: ii = a->i;
469: z = zarray;
470: }
472: slen = 4*(ii[mbs]-ii[0]);
473: v1 = bstrm->as;
474: v2 = v1 + slen;
475: v3 = v2 + slen;
476: v4 = v3 + slen;
478: for (i=0; i<mbs; i++) {
479: n = ii[1] - ii[0]; ii++;
480: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
481: nonzerorow += (n>0);
482: for (j=0; j<n; j++) {
483: xb = x + 4*(*idx++);
484: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
485: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
486: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
487: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
488: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
489: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
490: }
491: if (usecprow) z = zarray + 4*ridx[i];
492: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
493: if (!usecprow) z += 4;
494: }
495: VecRestoreArray(xx,(PetscScalar**)&x);
496: VecRestoreArray(zz,&zarray);
497: PetscLogFlops(32*a->nz - 4*nonzerorow);
498: return(0);
499: }
500: /*=========================================================*/
503: PetscErrorCode MatMult_SeqBSTRM_5(Mat A,Vec xx,Vec zz)
504: {
505: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
506: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
507: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
508: const PetscScalar *x,*xb;
509: const MatScalar *v1, *v2, *v3, *v4, *v5;
510: PetscErrorCode ierr;
511: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
512: PetscBool usecprow=a->compressedrow.use;
513: PetscInt slen;
516: VecGetArray(xx,(PetscScalar**)&x);
517: VecGetArray(zz,&zarray);
519: idx = a->j;
521: if (usecprow) {
522: mbs = a->compressedrow.nrows;
523: ii = a->compressedrow.i;
524: ridx = a->compressedrow.rindex;
525: } else {
526: mbs = a->mbs;
527: ii = a->i;
528: z = zarray;
529: }
531: slen = 5*(ii[mbs]-ii[0]);
532: v1 = bstrm->as;
533: v2 = v1 + slen;
534: v3 = v2 + slen;
535: v4 = v3 + slen;
536: v5 = v4 + slen;
538: for (i=0; i<mbs; i++) {
539: n = ii[1] - ii[0]; ii++;
540: sum1 = sum2 = sum3 = sum4 = sum5 = 0.0;
541: nonzerorow += (n>0);
542: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
543: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
544: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
546: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
547: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
550: for (j=0; j<n; j++) {
551: xb = x + 5*(*idx++);
552: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
553: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
554: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
555: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
556: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
557: sum5 += v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
558: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
559: }
560: if (usecprow) z = zarray + 5*ridx[i];
561: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
562: if (!usecprow) z += 5;
563: }
564: VecRestoreArray(xx,(PetscScalar**)&x);
565: VecRestoreArray(zz,&zarray);
566: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
567: return(0);
568: }
569: /*=========================================================*/
571: /*=========================================================*/
574: PetscErrorCode MatMultTranspose_SeqBSTRM_4(Mat A,Vec xx,Vec zz)
575: {
576: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
577: Mat_SeqBSTRM *sbstrm = (Mat_SeqBSTRM*)A->spptr;
578: PetscScalar zero = 0.0;
579: PetscScalar x1,x2,x3,x4;
580: PetscScalar *x, *xb, *z;
581: MatScalar *v1, *v2, *v3, *v4;
583: PetscInt mbs =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j;
584: PetscInt nonzerorow=0;
585: PetscInt slen;
588: VecSet(zz,zero);
589: VecGetArray(xx,&x);
590: VecGetArray(zz,&z);
592: slen = 4*(ai[mbs]-ai[0]);
593: v1 = sbstrm->as;
594: v2 = v1 + slen;
595: v3 = v2 + slen;
596: v4 = v3 + slen;
597: xb = x;
599: for (i=0; i<mbs; i++) {
600: n = ai[i+1] - ai[i];
601: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; xb += 4;
602: nonzerorow += (n>0);
603: ib = aj + ai[i];
605: for (j=0; j<n; j++) {
606: cval = ib[j]*4;
607: z[cval] += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
608: z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
609: z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
610: z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
611: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
612: }
614: }
615: VecRestoreArray(xx,&x);
616: VecRestoreArray(zz,&z);
617: PetscLogFlops(32*a->nz - 4*nonzerorow);
618: return(0);
619: }
620: /*=========================================================*/
623: PetscErrorCode MatMultTranspose_SeqBSTRM_5(Mat A,Vec xx,Vec zz)
624: {
625: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
626: Mat_SeqBSTRM *sbstrm = (Mat_SeqBSTRM*)A->spptr;
627: PetscScalar zero = 0.0;
628: PetscScalar *z = 0;
629: PetscScalar *x,*xb;
630: const MatScalar *v1, *v2, *v3, *v4, *v5;
631: PetscScalar x1, x2, x3, x4, x5;
632: PetscErrorCode ierr;
633: PetscInt mbs =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j;
634: PetscInt nonzerorow=0;
635: PetscInt slen;
638: VecSet(zz,zero);
639: VecGetArray(xx,&x);
640: VecGetArray(zz,&z);
642: slen = 5*(ai[mbs]-ai[0]);
644: v1 = sbstrm->as;
645: v2 = v1 + slen;
646: v3 = v2 + slen;
647: v4 = v3 + slen;
648: v5 = v4 + slen;
650: xb = x;
652: for (i=0; i<mbs; i++) {
653: n = ai[i+1] - ai[i];
654: nonzerorow += (n>0);
656: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; xb += 5;
657: ib = aj + ai[i];
659: PetscPrefetchBlock(ib+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
660: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
661: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
662: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
663: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
664: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
667: for (j=0; j<n; j++) {
668: cval = ib[j]*5;
669: z[cval] += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
670: z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
671: z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
672: z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
673: z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;
674: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
675: }
676: }
677: VecRestoreArray(xx,&x);
678: VecRestoreArray(zz,&z);
679: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
680: return(0);
681: }
682: /*=========================================================*/
685: PetscErrorCode MatMultAdd_SeqBSTRM_4(Mat A,Vec xx,Vec yy,Vec zz)
686: {
687: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
688: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
689: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
690: MatScalar *v1, *v2, *v3, *v4;
692: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
693: PetscBool usecprow=a->compressedrow.use;
694: PetscInt slen;
697: VecGetArray(xx,&x);
698: VecGetArray(yy,&yarray);
699: if (zz != yy) {
700: VecGetArray(zz,&zarray);
701: } else {
702: zarray = yarray;
703: }
705: idx = a->j;
706: if (usecprow) {
707: if (zz != yy) {
708: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
709: }
710: mbs = a->compressedrow.nrows;
711: ii = a->compressedrow.i;
712: ridx = a->compressedrow.rindex;
713: } else {
714: ii = a->i;
715: y = yarray;
716: z = zarray;
717: }
719: slen = 4*(ii[mbs]-ii[0]);
720: v1 = bstrm->as;
721: v2 = v1 + slen;
722: v3 = v2 + slen;
723: v4 = v3 + slen;
725: for (i=0; i<mbs; i++) {
726: n = ii[1] - ii[0]; ii++;
727: if (usecprow) {
728: z = zarray + 4*ridx[i];
729: y = yarray + 4*ridx[i];
730: }
731: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
732: for (j=0; j<n; j++) {
733: xb = x + 4*(*idx++);
734: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
735: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
736: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
737: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
738: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
739: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
740: }
741: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
742: if (!usecprow) {
743: z += 4; y += 4;
744: }
745: }
746: VecRestoreArray(xx,&x);
747: VecRestoreArray(yy,&yarray);
748: if (zz != yy) {
749: VecRestoreArray(zz,&zarray);
750: }
751: PetscLogFlops(32.0*a->nz);
752: return(0);
753: }
755: /*=========================================================*/
758: PetscErrorCode MatMultAdd_SeqBSTRM_5(Mat A,Vec xx,Vec yy,Vec zz)
759: {
760: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
761: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*)A->spptr;
762: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
763: PetscScalar *yarray,*zarray;
764: MatScalar *v1,*v2,*v3,*v4,*v5;
766: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
767: PetscBool usecprow=a->compressedrow.use;
768: PetscInt slen;
771: VecGetArray(xx,&x);
772: VecGetArray(yy,&yarray);
773: if (zz != yy) {
774: VecGetArray(zz,&zarray);
775: } else {
776: zarray = yarray;
777: }
780: idx = a->j;
781: if (usecprow) {
782: if (zz != yy) {
783: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
784: }
785: mbs = a->compressedrow.nrows;
786: ii = a->compressedrow.i;
787: ridx = a->compressedrow.rindex;
788: } else {
789: ii = a->i;
790: y = yarray;
791: z = zarray;
792: }
794: slen = 5*(ii[mbs]-ii[0]);
795: v1 = bstrm->as;
796: v2 = v1 + slen;
797: v3 = v2 + slen;
798: v4 = v3 + slen;
799: v5 = v4 + slen;
802: for (i=0; i<mbs; i++) {
803: n = ii[1] - ii[0]; ii++;
804: if (usecprow) {
805: z = zarray + 5*ridx[i];
806: y = yarray + 5*ridx[i];
807: }
808: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
809: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
810: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
811: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
812: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
813: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
814: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
817: for (j=0; j<n; j++) {
818: xb = x + 5*(*idx++);
819: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
820: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
821: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
822: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
823: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
824: sum5 += v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
825: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
826: }
827: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
828: if (!usecprow) {
829: z += 5; y += 5;
830: }
831: }
832: VecRestoreArray(xx,&x);
833: VecRestoreArray(yy,&yarray);
834: if (zz != yy) {
835: VecRestoreArray(zz,&zarray);
836: }
837: PetscLogFlops(50.0*a->nz);
838: return(0);
839: }
841: /*=========================================================*/
844: PetscErrorCode MatSeqBSTRM_create_bstrm(Mat A)
845: {
846: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*) A->data;
847: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*) A->spptr;
848: PetscInt MROW = a->mbs, bs = A->rmap->bs;
849: PetscInt *ai = a->i;
850: PetscInt i,j,k;
851: MatScalar *aa = a->a;
853: PetscInt bs2, rbs, cbs, blen, slen;
854: PetscScalar **asp;
857: bstrm->rbs = bstrm->cbs = bs;
859: rbs = cbs = bs;
860: bs2 = rbs*cbs;
861: blen = ai[MROW]-ai[0];
862: slen = blen*cbs;
864: PetscMalloc1(bs2*blen, &bstrm->as);
866: PetscMalloc1(rbs, &asp);
868: for (i=0; i<rbs; i++) asp[i] = bstrm->as + i*slen;
870: for (k=0; k<blen; k++) {
871: for (j=0; j<cbs; j++) {
872: for (i=0; i<rbs; i++) asp[i][k*cbs+j] = aa[k*bs2+j*rbs+i];
873: }
874: }
876: PetscFree(asp);
878: switch (bs) {
879: case 4:
880: A->ops->mult = MatMult_SeqBSTRM_4;
881: A->ops->multadd = MatMultAdd_SeqBSTRM_4;
882: A->ops->multtranspose = MatMultTranspose_SeqBSTRM_4;
883: A->ops->sor = MatSOR_SeqBSTRM_4;
884: break;
885: case 5:
886: A->ops->mult = MatMult_SeqBSTRM_5;
887: A->ops->multadd = MatMultAdd_SeqBSTRM_5;
888: A->ops->multtranspose = MatMultTranspose_SeqBSTRM_5;
889: A->ops->sor = MatSOR_SeqBSTRM_5;
890: break;
891: default:
892: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
893: }
894: return(0);
895: }
896: /*=========================================================*/
897: /*=========================================================*/
901: PetscErrorCode MatSeqBSTRMTransform(Mat A)
902: {
904: MatSeqBSTRM_convert_bstrm(A);
905: return(0);
906: }