Actual source code: bstream.c
petsc-3.3-p7 2013-05-11
1: #define PETSCMAT_DLL
3: #include ../src/mat/impls/baij/seq/baij.h
4: #include ../src/mat/impls/baij/seq/bstream/bstream.h
8: PetscErrorCode MatDestroy_SeqBSTRM(Mat A)
9: {
10: PetscErrorCode ierr;
11: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *) A->spptr;
13: if(bstrm) {
14: PetscFree(bstrm->as);
15: }
16: PetscObjectChangeTypeName( (PetscObject)A, MATSEQBAIJ);
17: MatDestroy_SeqBAIJ(A);
18: return(0);
19: }
20: /*=========================================================*/
21: PetscErrorCode MatDuplicate_SeqBSTRM(Mat A, MatDuplicateOption op, Mat *M)
22: {
24: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Cannot duplicate STRM matrices yet");
25: return(0);
26: }
27: /*=========================================================*/
28: extern PetscErrorCode MatSolve_SeqBSTRM_4(Mat A,Vec bb,Vec xx);
29: extern PetscErrorCode MatSolve_SeqBSTRM_5(Mat A,Vec bb,Vec xx);
32: /*=========================================================*/
35: PetscErrorCode MatSeqBSTRM_convert_bstrm(Mat A)
36: {
37: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
38: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*) A->spptr;
39: PetscInt m = a->mbs, bs = A->rmap->bs;
40: PetscInt *ai = a->i, *diag = a->diag;
41: PetscInt i,j,ib,jb;
42: MatScalar *aa = a->a;
44: PetscInt bs2, rbs, cbs, blen, slen;
45: PetscScalar **asp;
48: bstrm->rbs = bs;
49: bstrm->cbs = bs;
52: rbs = cbs = bs;
53: bs2 = rbs*cbs;
54: blen = ai[m]-ai[0]+diag[0] - diag[m];
55: slen = blen*cbs;
57: PetscFree(bstrm->as);
58: PetscMalloc(bs2*blen*sizeof(MatScalar), &bstrm->as);
60: PetscMalloc(rbs*sizeof(MatScalar *), &asp);
61:
62: for(i=0;i<rbs;i++) asp[i] = bstrm->as + i*slen;
64: for(j=0;j<blen;j++) {
65: for (jb=0; jb<cbs; jb++){
66: for (ib=0; ib<rbs; ib++){
67: asp[ib][j*cbs+jb] = aa[j*bs2+jb*rbs+ib];
68: }}
69: }
71: PetscFree(asp);
72: switch (bs){
73: case 4:
74: A->ops->solve = MatSolve_SeqBSTRM_4;
75: break;
76: case 5:
77: A->ops->solve = MatSolve_SeqBSTRM_5;
78: break;
79: default:
80: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
81: }
82: return(0);
83: }
85: /*=========================================================*/
86: extern PetscErrorCode MatSeqBSTRM_create_bstrm(Mat);
90: PetscErrorCode MatAssemblyEnd_SeqBSTRM(Mat A, MatAssemblyType mode)
91: {
95: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
96: MatAssemblyEnd_SeqBAIJ(A, mode);
98: MatSeqBSTRM_create_bstrm(A);
99: return(0);
100: }
101: /*=========================================================*/
102: EXTERN_C_BEGIN
105: PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat A,const MatType type,MatReuse reuse,Mat *newmat)
106: {
108: Mat B = *newmat;
109: Mat_SeqBSTRM *bstrm;
112: if (reuse == MAT_INITIAL_MATRIX) {
113: MatDuplicate(A,MAT_COPY_VALUES,&B);
114: }
115:
117: PetscNewLog(B,Mat_SeqBSTRM,&bstrm);
118: B->spptr = (void *) bstrm;
120: /* Set function pointers for methods that we inherit from BAIJ but override. */
121: B->ops->duplicate = MatDuplicate_SeqBSTRM;
122: B->ops->assemblyend = MatAssemblyEnd_SeqBSTRM;
123: B->ops->destroy = MatDestroy_SeqBSTRM;
125: /* If A has already been assembled, compute the permutation. */
126: if (A->assembled) {
127: MatSeqBSTRM_create_bstrm(B);
128: }
129: PetscObjectChangeTypeName((PetscObject)B,MATSEQBSTRM);
130: *newmat = B;
131: return(0);
132: }
133: EXTERN_C_END
135: /*=========================================================*/
138: PetscErrorCode MatCreateSeqBSTRM(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
139: {
142: MatCreate(comm,A);
143: MatSetSizes(*A,m,n,m,n);
144: MatSetType(*A,MATSEQBSTRM);
145: MatSeqBAIJSetPreallocation_SeqBAIJ(*A,bs,nz,(PetscInt*)nnz);
146: (*A)->rmap->bs = bs;
147: return(0);
148: }
149: /*=========================================================*/
150: EXTERN_C_BEGIN
153: PetscErrorCode MatCreate_SeqBSTRM(Mat A)
154: {
158: MatSetType(A,MATSEQBAIJ);
159: MatConvert_SeqBAIJ_SeqBSTRM(A,MATSEQBSTRM,MAT_REUSE_MATRIX,&A);
161: PetscObjectComposeFunctionDynamic((PetscObject)A,"MatConvert_seqbaij_seqbstrm_C",
162: "MatConvert_SeqBAIJ_SeqBSTRM",
163: MatConvert_SeqBAIJ_SeqBSTRM);
164: return(0);
165: }
166: EXTERN_C_END
167: /*=========================================================*/
169: /*=========================================================*/
172: PetscErrorCode MatSOR_SeqBSTRM_4(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
173: {
174: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
175: PetscScalar *x,x1,x2,x3,x4,s1,s2,s3,s4;
176: const MatScalar *idiag,*mdiag;
177: const PetscScalar *b;
178: PetscErrorCode ierr;
179: PetscInt m = a->mbs,i,i2,nz,idx;
180: const PetscInt *diag,*ai = a->i,*aj = a->j,*vi;
182: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
183: MatScalar *v1,*v2,*v3,*v4, *v10,*v20,*v30,*v40;
184: PetscInt slen;
187: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
188: its = its*lits;
189: if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
190: if (fshift) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for diagonal shift");
191: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for non-trivial relaxation factor");
192: if ((flag & SOR_APPLY_UPPER) || (flag & SOR_APPLY_LOWER)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for applying upper or lower triangular parts");
193: if (its > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support yet for multiple point block SOR iterations");
195: if (!a->idiagvalid){MatInvertBlockDiagonal(A,PETSC_NULL);}
197: diag = a->diag;
198: idiag = a->idiag;
199: VecGetArray(xx,&x);
200: VecGetArray(bb,(PetscScalar**)&b);
202: slen = 4*(ai[m]-ai[0]);
203: v10 = bstrm->as;
204: v20 = v10 + slen;
205: v30 = v20 + slen;
206: v40 = v30 + slen;
208: if (flag & SOR_ZERO_INITIAL_GUESS) {
209: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
210: x[0] = b[0]*idiag[0] + b[1]*idiag[4] + b[2]*idiag[8] + b[3]*idiag[12];
211: x[1] = b[0]*idiag[1] + b[1]*idiag[5] + b[2]*idiag[9] + b[3]*idiag[13];
212: x[2] = b[0]*idiag[2] + b[1]*idiag[6] + b[2]*idiag[10] + b[3]*idiag[14];
213: x[3] = b[0]*idiag[3] + b[1]*idiag[7] + b[2]*idiag[11] + b[3]*idiag[15];
214: i2 = 4;
215: idiag += 16;
216: for (i=1; i<m; i++) {
217: v1 = v10 + 4*ai[i];
218: v2 = v20 + 4*ai[i];
219: v3 = v30 + 4*ai[i];
220: v4 = v40 + 4*ai[i];
221: vi = aj + ai[i];
222: nz = diag[i] - ai[i];
223: s1 = b[i2]; s2 = b[i2+1]; s3 = b[i2+2]; s4 = b[i2+3];
224: while (nz--) {
225: idx = 4*(*vi++);
226: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
227: s1 -= v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
228: s2 -= v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
229: s3 -= v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
230: s4 -= v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
231: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
232: }
233: x[i2] = idiag[0]*s1 + idiag[4]*s2 + idiag[8]*s3 + idiag[12]*s4;
234: x[i2+1] = idiag[1]*s1 + idiag[5]*s2 + idiag[9]*s3 + idiag[13]*s4;
235: x[i2+2] = idiag[2]*s1 + idiag[6]*s2 + idiag[10]*s3 + idiag[14]*s4;
236: x[i2+3] = idiag[3]*s1 + idiag[7]*s2 + idiag[11]*s3 + idiag[15]*s4;
237: idiag += 16;
238: i2 += 4;
239: }
240: /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
241: PetscLogFlops(16.0*(a->nz));
242: }
243: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
244: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
245: i2 = 0;
246: mdiag = a->idiag+16*a->mbs;
247: for (i=0; i<m; i++) {
248: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3];
249: x[i2] = mdiag[0]*x1 + mdiag[4]*x2 + mdiag[8]*x3 + mdiag[12]*x4;
250: x[i2+1] = mdiag[1]*x1 + mdiag[5]*x2 + mdiag[9]*x3 + mdiag[13]*x4;
251: x[i2+2] = mdiag[2]*x1 + mdiag[6]*x2 + mdiag[10]*x3 + mdiag[14]*x4;
252: x[i2+3] = mdiag[3]*x1 + mdiag[7]*x2 + mdiag[11]*x3 + mdiag[15]*x4;
253: mdiag += 16;
254: i2 += 4;
255: }
256: PetscLogFlops(28.0*m);
257: } else if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
258: PetscMemcpy(x,b,A->rmap->N*sizeof(PetscScalar));
259: }
260: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
261: idiag = a->idiag+16*a->mbs - 16;
262: i2 = 4*m - 4;
263: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3];
264: x[i2] = idiag[0]*x1 + idiag[4]*x2 + idiag[8]*x3 + idiag[12]*x4;
265: x[i2+1] = idiag[1]*x1 + idiag[5]*x2 + idiag[9]*x3 + idiag[13]*x4;
266: x[i2+2] = idiag[2]*x1 + idiag[6]*x2 + idiag[10]*x3 + idiag[14]*x4;
267: x[i2+3] = idiag[3]*x1 + idiag[7]*x2 + idiag[11]*x3 + idiag[15]*x4;
268: idiag -= 16;
269: i2 -= 4;
270: for (i=m-2; i>=0; i--) {
271: v1 = v10 + 4*(ai[i+1]-1);
272: v2 = v20 + 4*(ai[i+1]-1);
273: v3 = v30 + 4*(ai[i+1]-1);
274: v4 = v40 + 4*(ai[i+1]-1);
275: vi = aj + ai[i+1] - 1;
276: nz = ai[i+1] - diag[i] - 1;
277: s1 = x[i2]; s2 = x[i2+1]; s3 = x[i2+2]; s4 = x[i2+3];
278: while (nz--) {
279: idx = 4*(*vi--);
280: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
281: s1 -= v1[3]*x4 + v1[2]*x3 + v1[1]*x2 + v1[0]*x1;
282: s2 -= v2[3]*x4 + v2[2]*x3 + v2[1]*x2 + v2[0]*x1;
283: s3 -= v3[3]*x4 + v3[2]*x3 + v3[1]*x2 + v3[0]*x1;
284: s4 -= v4[3]*x4 + v4[2]*x3 + v4[1]*x2 + v4[0]*x1;
285: v1 -= 4; v2 -= 4; v3 -= 4; v4 -= 4;
286: }
287: x[i2] = idiag[0]*s1 + idiag[4]*s2 + idiag[8]*s3 + idiag[12]*s4;
288: x[i2+1] = idiag[1]*s1 + idiag[5]*s2 + idiag[9]*s3 + idiag[13]*s4;
289: x[i2+2] = idiag[2]*s1 + idiag[6]*s2 + idiag[10]*s3 + idiag[14]*s4;
290: x[i2+3] = idiag[3]*s1 + idiag[7]*s2 + idiag[11]*s3 + idiag[15]*s4;
291: idiag -= 16;
292: i2 -= 4;
293: }
294: PetscLogFlops(16.0*(a->nz));
295: }
296: } else {
297: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only supports point block SOR with zero initial guess");
298: }
299: VecRestoreArray(xx,&x);
300: VecRestoreArray(bb,(PetscScalar**)&b);
301: return(0);
302: }
303: /*=========================================================*/
304: /*=========================================================*/
307: PetscErrorCode MatSOR_SeqBSTRM_5(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
308: {
309: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
310: PetscScalar *x,x1,x2,x3,x4,x5,s1,s2,s3,s4,s5;
311: const MatScalar *idiag,*mdiag;
312: const PetscScalar *b;
313: PetscErrorCode ierr;
314: PetscInt m = a->mbs,i,i2,nz,idx;
315: const PetscInt *diag,*ai = a->i,*aj = a->j,*vi;
317: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
318: MatScalar *v1, *v2, *v3, *v4, *v5, *v10, *v20, *v30, *v40, *v50;
319: PetscInt slen;
322: if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
323: its = its*lits;
324: if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
325: if (fshift) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for diagonal shift");
326: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for non-trivial relaxation factor");
327: if ((flag & SOR_APPLY_UPPER) || (flag & SOR_APPLY_LOWER)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support for applying upper or lower triangular parts");
328: if (its > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Sorry, no support yet for multiple point block SOR iterations");
330: if (!a->idiagvalid){MatInvertBlockDiagonal(A,PETSC_NULL);}
332: diag = a->diag;
333: idiag = a->idiag;
334: VecGetArray(xx,&x);
335: VecGetArray(bb,(PetscScalar**)&b);
337: slen = 5*(ai[m]-ai[0]);
338: v10 = bstrm->as;
339: v20 = v10 + slen;
340: v30 = v20 + slen;
341: v40 = v30 + slen;
342: v50 = v40 + slen;
344: if (flag & SOR_ZERO_INITIAL_GUESS) {
345: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
346: x[0] = b[0]*idiag[0] + b[1]*idiag[5] + b[2]*idiag[10] + b[3]*idiag[15] + b[4]*idiag[20];
347: x[1] = b[0]*idiag[1] + b[1]*idiag[6] + b[2]*idiag[11] + b[3]*idiag[16] + b[4]*idiag[21];
348: x[2] = b[0]*idiag[2] + b[1]*idiag[7] + b[2]*idiag[12] + b[3]*idiag[17] + b[4]*idiag[22];
349: x[3] = b[0]*idiag[3] + b[1]*idiag[8] + b[2]*idiag[13] + b[3]*idiag[18] + b[4]*idiag[23];
350: x[4] = b[0]*idiag[4] + b[1]*idiag[9] + b[2]*idiag[14] + b[3]*idiag[19] + b[4]*idiag[24];
351: i2 = 5;
352: idiag += 25;
353: for (i=1; i<m; i++) {
354: v1 = v10 + 5*ai[i];
355: v2 = v20 + 5*ai[i];
356: v3 = v30 + 5*ai[i];
357: v4 = v40 + 5*ai[i];
358: v5 = v50 + 5*ai[i];
359: vi = aj + ai[i];
360: nz = diag[i] - ai[i];
361: s1 = b[i2]; s2 = b[i2+1]; s3 = b[i2+2]; s4 = b[i2+3]; s5 = b[i2+4];
362: while (nz--) {
363: idx = 5*(*vi++);
364: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
365: s1 -= v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
366: s2 -= v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
367: s3 -= v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
368: s4 -= v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
369: s5 -= v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
370: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
371: }
372: x[i2] = idiag[0]*s1 + idiag[5]*s2 + idiag[10]*s3 + idiag[15]*s4 + idiag[20]*s5;
373: x[i2+1] = idiag[1]*s1 + idiag[6]*s2 + idiag[11]*s3 + idiag[16]*s4 + idiag[21]*s5;
374: x[i2+2] = idiag[2]*s1 + idiag[7]*s2 + idiag[12]*s3 + idiag[17]*s4 + idiag[22]*s5;
375: x[i2+3] = idiag[3]*s1 + idiag[8]*s2 + idiag[13]*s3 + idiag[18]*s4 + idiag[23]*s5;
376: x[i2+4] = idiag[4]*s1 + idiag[9]*s2 + idiag[14]*s3 + idiag[19]*s4 + idiag[24]*s5;
377: idiag += 25;
378: i2 += 5;
379: }
380: /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
381: PetscLogFlops(25.0*(a->nz));
382: }
383: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
384: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
385: i2 = 0;
386: mdiag = a->idiag+25*a->mbs;
387: for (i=0; i<m; i++) {
388: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3]; x5 = x[i2+4];
389: x[i2] = mdiag[0]*x1 + mdiag[5]*x2 + mdiag[10]*x3 + mdiag[15]*x4 + mdiag[20]*x5;
390: x[i2+1] = mdiag[1]*x1 + mdiag[6]*x2 + mdiag[11]*x3 + mdiag[16]*x4 + mdiag[21]*x5;
391: x[i2+2] = mdiag[2]*x1 + mdiag[7]*x2 + mdiag[12]*x3 + mdiag[17]*x4 + mdiag[22]*x5;
392: x[i2+3] = mdiag[3]*x1 + mdiag[8]*x2 + mdiag[13]*x3 + mdiag[18]*x4 + mdiag[23]*x5;
393: x[i2+4] = mdiag[4]*x1 + mdiag[9]*x2 + mdiag[14]*x3 + mdiag[19]*x4 + mdiag[24]*x5;
394: mdiag += 25;
395: i2 += 5;
396: }
397: PetscLogFlops(45.0*m);
398: } else if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
399: PetscMemcpy(x,b,A->rmap->N*sizeof(PetscScalar));
400: }
401: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
402: idiag = a->idiag+25*a->mbs - 25;
403: i2 = 5*m - 5;
404: x1 = x[i2]; x2 = x[i2+1]; x3 = x[i2+2]; x4 = x[i2+3]; x5 = x[i2+4];
405: x[i2] = idiag[0]*x1 + idiag[5]*x2 + idiag[10]*x3 + idiag[15]*x4 + idiag[20]*x5;
406: x[i2+1] = idiag[1]*x1 + idiag[6]*x2 + idiag[11]*x3 + idiag[16]*x4 + idiag[21]*x5;
407: x[i2+2] = idiag[2]*x1 + idiag[7]*x2 + idiag[12]*x3 + idiag[17]*x4 + idiag[22]*x5;
408: x[i2+3] = idiag[3]*x1 + idiag[8]*x2 + idiag[13]*x3 + idiag[18]*x4 + idiag[23]*x5;
409: x[i2+4] = idiag[4]*x1 + idiag[9]*x2 + idiag[14]*x3 + idiag[19]*x4 + idiag[24]*x5;
410: idiag -= 25;
411: i2 -= 5;
412: for (i=m-2; i>=0; i--) {
413: v1 = v10 + 5*(ai[i+1]-1);
414: v2 = v20 + 5*(ai[i+1]-1);
415: v3 = v30 + 5*(ai[i+1]-1);
416: v4 = v40 + 5*(ai[i+1]-1);
417: v5 = v50 + 5*(ai[i+1]-1);
418: vi = aj + ai[i+1] - 1;
419: nz = ai[i+1] - diag[i] - 1;
420: s1 = x[i2]; s2 = x[i2+1]; s3 = x[i2+2]; s4 = x[i2+3]; s5 = x[i2+4];
421: while (nz--) {
422: idx = 5*(*vi--);
423: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
424: s1 -= v1[4]*x5 + v1[3]*x4 + v1[2]*x3 + v1[1]*x2 + v1[0]*x1;
425: s2 -= v2[4]*x5 + v2[3]*x4 + v2[2]*x3 + v2[1]*x2 + v2[0]*x1;
426: s3 -= v3[4]*x5 + v3[3]*x4 + v3[2]*x3 + v3[1]*x2 + v3[0]*x1;
427: s4 -= v4[4]*x5 + v4[3]*x4 + v4[2]*x3 + v4[1]*x2 + v4[0]*x1;
428: s5 -= v5[4]*x5 + v5[3]*x4 + v5[2]*x3 + v5[1]*x2 + v5[0]*x1;
429: v1 -= 5; v2 -= 5; v3 -= 5; v4 -= 5; v5 -= 5;
430: }
431: x[i2] = idiag[0]*s1 + idiag[5]*s2 + idiag[10]*s3 + idiag[15]*s4 + idiag[20]*s5;
432: x[i2+1] = idiag[1]*s1 + idiag[6]*s2 + idiag[11]*s3 + idiag[16]*s4 + idiag[21]*s5;
433: x[i2+2] = idiag[2]*s1 + idiag[7]*s2 + idiag[12]*s3 + idiag[17]*s4 + idiag[22]*s5;
434: x[i2+3] = idiag[3]*s1 + idiag[8]*s2 + idiag[13]*s3 + idiag[18]*s4 + idiag[23]*s5;
435: x[i2+4] = idiag[4]*s1 + idiag[9]*s2 + idiag[14]*s3 + idiag[19]*s4 + idiag[24]*s5;
436: idiag -= 25;
437: i2 -= 5;
438: }
439: PetscLogFlops(25.0*(a->nz));
440: }
441: } else {
442: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only supports point block SOR with zero initial guess");
443: }
444: VecRestoreArray(xx,&x);
445: VecRestoreArray(bb,(PetscScalar**)&b);
446: return(0);
447: }
448: /*=========================================================*/
449: /*=========================================================*/
452: PetscErrorCode MatMult_SeqBSTRM_4(Mat A,Vec xx,Vec zz)
453: {
454: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
455: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
456: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
457: const PetscScalar *x,*xb;
458: const MatScalar *v1, *v2, *v3, *v4;
459: PetscErrorCode ierr;
460: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
461: PetscBool usecprow=a->compressedrow.use;
462: PetscInt slen;
465: VecGetArray(xx,(PetscScalar**)&x);
466: VecGetArray(zz,&zarray);
468: idx = a->j;
470: if (usecprow){
471: mbs = a->compressedrow.nrows;
472: ii = a->compressedrow.i;
473: ridx = a->compressedrow.rindex;
474: } else {
475: mbs = a->mbs;
476: ii = a->i;
477: z = zarray;
478: }
479:
480: slen = 4*(ii[mbs]-ii[0]);
481: v1 = bstrm->as;
482: v2 = v1 + slen;
483: v3 = v2 + slen;
484: v4 = v3 + slen;
486: for (i=0; i<mbs; i++) {
487: n = ii[1] - ii[0]; ii++;
488: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
489: nonzerorow += (n>0);
490: for (j=0; j<n; j++) {
491: xb = x + 4*(*idx++);
492: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
493: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
494: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
495: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
496: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
497: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
498: }
499: if (usecprow) z = zarray + 4*ridx[i];
500: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
501: if (!usecprow) z += 4;
502: }
503: VecRestoreArray(xx,(PetscScalar**)&x);
504: VecRestoreArray(zz,&zarray);
505: PetscLogFlops(32*a->nz - 4*nonzerorow);
506: return(0);
507: }
508: /*=========================================================*/
511: PetscErrorCode MatMult_SeqBSTRM_5(Mat A,Vec xx,Vec zz)
512: {
513: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
514: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
515: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
516: const PetscScalar *x,*xb;
517: const MatScalar *v1, *v2, *v3, *v4, *v5;
518: PetscErrorCode ierr;
519: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
520: PetscBool usecprow=a->compressedrow.use;
521: PetscInt slen;
524: VecGetArray(xx,(PetscScalar**)&x);
525: VecGetArray(zz,&zarray);
527: idx = a->j;
529: if (usecprow){
530: mbs = a->compressedrow.nrows;
531: ii = a->compressedrow.i;
532: ridx = a->compressedrow.rindex;
533: } else {
534: mbs = a->mbs;
535: ii = a->i;
536: z = zarray;
537: }
538:
539: slen = 5*(ii[mbs]-ii[0]);
540: v1 = bstrm->as;
541: v2 = v1 + slen;
542: v3 = v2 + slen;
543: v4 = v3 + slen;
544: v5 = v4 + slen;
546: for (i=0; i<mbs; i++) {
547: n = ii[1] - ii[0]; ii++;
548: sum1 = sum2 = sum3 = sum4 = sum5 = 0.0;
549: nonzerorow += (n>0);
550: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
551: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
552: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
553: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
554: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
555: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558: for (j=0; j<n; j++) {
559: xb = x + 5*(*idx++);
560: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
561: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
562: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
563: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
564: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
565: sum5 += v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
566: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
567: }
568: if (usecprow) z = zarray + 5*ridx[i];
569: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
570: if (!usecprow) z += 5;
571: }
572: VecRestoreArray(xx,(PetscScalar**)&x);
573: VecRestoreArray(zz,&zarray);
574: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
575: return(0);
576: }
577: /*=========================================================*/
579: /*=========================================================*/
582: PetscErrorCode MatMultTranspose_SeqBSTRM_4(Mat A,Vec xx,Vec zz)
583: {
584: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
585: Mat_SeqBSTRM *sbstrm = (Mat_SeqBSTRM *)A->spptr;
586: PetscScalar zero = 0.0;
587: PetscScalar x1,x2,x3,x4;
588: PetscScalar *x, *xb, *z;
589: MatScalar *v1, *v2, *v3, *v4;
590: PetscErrorCode ierr;
591: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j;
592: PetscInt nonzerorow=0;
593: PetscInt slen;
596: VecSet(zz,zero);
597: VecGetArray(xx,&x);
598: VecGetArray(zz,&z);
600: slen = 4*(ai[mbs]-ai[0]);
601: v1 = sbstrm->as;
602: v2 = v1 + slen;
603: v3 = v2 + slen;
604: v4 = v3 + slen;
605: xb = x;
607: for (i=0; i<mbs; i++) {
608: n = ai[i+1] - ai[i];
609: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; xb += 4;
610: nonzerorow += (n>0);
611: ib = aj + ai[i];
612:
613: for (j=0; j<n; j++) {
614: cval = ib[j]*4;
615: z[cval] += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4;
616: z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4;
617: z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4;
618: z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4;
619: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
620: }
621:
622: }
623: VecRestoreArray(xx,&x);
624: VecRestoreArray(zz,&z);
625: PetscLogFlops(32*a->nz - 4*nonzerorow);
626: return(0);
627: }
628: /*=========================================================*/
631: PetscErrorCode MatMultTranspose_SeqBSTRM_5(Mat A,Vec xx,Vec zz)
632: {
633: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
634: Mat_SeqBSTRM *sbstrm = (Mat_SeqBSTRM *)A->spptr;
635: PetscScalar zero = 0.0;
636: PetscScalar *z = 0;
637: PetscScalar *x,*xb;
638: const MatScalar *v1, *v2, *v3, *v4, *v5;
639: PetscScalar x1, x2, x3, x4, x5;
640: PetscErrorCode ierr;
641: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j;
642: PetscInt nonzerorow=0;
643: PetscInt slen;
646: VecSet(zz,zero);
647: VecGetArray(xx,&x);
648: VecGetArray(zz,&z);
650: slen = 5*(ai[mbs]-ai[0]);
652: v1 = sbstrm->as;
653: v2 = v1 + slen;
654: v3 = v2 + slen;
655: v4 = v3 + slen;
656: v5 = v4 + slen;
658: xb = x;
660: for (i=0; i<mbs; i++) {
661: n = ai[i+1] - ai[i];
662: nonzerorow += (n>0);
664: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; xb += 5;
665: ib = aj + ai[i];
667: PetscPrefetchBlock(ib+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
668: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
669: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
670: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
671: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
672: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
674:
675: for (j=0; j<n; j++) {
676: cval = ib[j]*5;
677: z[cval] += v1[0]*x1 + v2[0]*x2 + v3[0]*x3 + v4[0]*x4 + v5[0]*x5;
678: z[cval+1] += v1[1]*x1 + v2[1]*x2 + v3[1]*x3 + v4[1]*x4 + v5[1]*x5;
679: z[cval+2] += v1[2]*x1 + v2[2]*x2 + v3[2]*x3 + v4[2]*x4 + v5[2]*x5;
680: z[cval+3] += v1[3]*x1 + v2[3]*x2 + v3[3]*x3 + v4[3]*x4 + v5[3]*x5;
681: z[cval+4] += v1[4]*x1 + v2[4]*x2 + v3[4]*x3 + v4[4]*x4 + v5[4]*x5;
682: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
683: }
684: }
685: VecRestoreArray(xx,&x);
686: VecRestoreArray(zz,&z);
687: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
688: return(0);
689: }
690: /*=========================================================*/
693: PetscErrorCode MatMultAdd_SeqBSTRM_4(Mat A,Vec xx,Vec yy,Vec zz)
694: {
695: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
696: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
697: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
698: MatScalar *v1, *v2, *v3, *v4;
700: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
701: PetscBool usecprow=a->compressedrow.use;
702: PetscInt slen;
705: VecGetArray(xx,&x);
706: VecGetArray(yy,&yarray);
707: if (zz != yy) {
708: VecGetArray(zz,&zarray);
709: } else {
710: zarray = yarray;
711: }
713: idx = a->j;
714: if (usecprow){
715: if (zz != yy){
716: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
717: }
718: mbs = a->compressedrow.nrows;
719: ii = a->compressedrow.i;
720: ridx = a->compressedrow.rindex;
721: } else {
722: ii = a->i;
723: y = yarray;
724: z = zarray;
725: }
727: slen = 4*(ii[mbs]-ii[0]);
728: v1 = bstrm->as;
729: v2 = v1 + slen;
730: v3 = v2 + slen;
731: v4 = v3 + slen;
733: for (i=0; i<mbs; i++) {
734: n = ii[1] - ii[0]; ii++;
735: if (usecprow){
736: z = zarray + 4*ridx[i];
737: y = yarray + 4*ridx[i];
738: }
739: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
740: for (j=0; j<n; j++) {
741: xb = x + 4*(*idx++);
742: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
743: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4;
744: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4;
745: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4;
746: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4;
747: v1 += 4; v2 += 4; v3 += 4; v4 += 4;
748: }
749: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
750: if (!usecprow){
751: z += 4; y += 4;
752: }
753: }
754: VecRestoreArray(xx,&x);
755: VecRestoreArray(yy,&yarray);
756: if (zz != yy) {
757: VecRestoreArray(zz,&zarray);
758: }
759: PetscLogFlops(32.0*a->nz);
760: return(0);
761: }
763: /*=========================================================*/
766: PetscErrorCode MatMultAdd_SeqBSTRM_5(Mat A,Vec xx,Vec yy,Vec zz)
767: {
768: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
769: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM *)A->spptr;
770: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
771: PetscScalar *yarray,*zarray;
772: MatScalar *v1,*v2,*v3,*v4,*v5;
774: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
775: PetscBool usecprow=a->compressedrow.use;
776: PetscInt slen;
779: VecGetArray(xx,&x);
780: VecGetArray(yy,&yarray);
781: if (zz != yy) {
782: VecGetArray(zz,&zarray);
783: } else {
784: zarray = yarray;
785: }
788: idx = a->j;
789: if (usecprow){
790: if (zz != yy){
791: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
792: }
793: mbs = a->compressedrow.nrows;
794: ii = a->compressedrow.i;
795: ridx = a->compressedrow.rindex;
796: } else {
797: ii = a->i;
798: y = yarray;
799: z = zarray;
800: }
802: slen = 5*(ii[mbs]-ii[0]);
803: v1 = bstrm->as;
804: v2 = v1 + slen;
805: v3 = v2 + slen;
806: v4 = v3 + slen;
807: v5 = v4 + slen;
810: for (i=0; i<mbs; i++) {
811: n = ii[1] - ii[0]; ii++;
812: if (usecprow){
813: z = zarray + 5*ridx[i];
814: y = yarray + 5*ridx[i];
815: }
816: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
817: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
818: PetscPrefetchBlock(v1+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
819: PetscPrefetchBlock(v2+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
820: PetscPrefetchBlock(v3+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
821: PetscPrefetchBlock(v4+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
822: PetscPrefetchBlock(v5+5*n,5*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
825: for (j=0; j<n; j++) {
826: xb = x + 5*(*idx++);
827: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
828: sum1 += v1[0]*x1 + v1[1]*x2 + v1[2]*x3 + v1[3]*x4 + v1[4]*x5;
829: sum2 += v2[0]*x1 + v2[1]*x2 + v2[2]*x3 + v2[3]*x4 + v2[4]*x5;
830: sum3 += v3[0]*x1 + v3[1]*x2 + v3[2]*x3 + v3[3]*x4 + v3[4]*x5;
831: sum4 += v4[0]*x1 + v4[1]*x2 + v4[2]*x3 + v4[3]*x4 + v4[4]*x5;
832: sum5 += v5[0]*x1 + v5[1]*x2 + v5[2]*x3 + v5[3]*x4 + v5[4]*x5;
833: v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5;
834: }
835: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
836: if (!usecprow){
837: z += 5; y += 5;
838: }
839: }
840: VecRestoreArray(xx,&x);
841: VecRestoreArray(yy,&yarray);
842: if (zz != yy) {
843: VecRestoreArray(zz,&zarray);
844: }
845: PetscLogFlops(50.0*a->nz);
846: return(0);
847: }
849: /*=========================================================*/
852: PetscErrorCode MatSeqBSTRM_create_bstrm(Mat A)
853: {
854: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
855: Mat_SeqBSTRM *bstrm = (Mat_SeqBSTRM*) A->spptr;
856: PetscInt MROW = a->mbs, bs = A->rmap->bs;
857: PetscInt *ai = a->i;
858: PetscInt i,j,k;
859: MatScalar *aa = a->a;
861: PetscInt bs2, rbs, cbs, blen, slen;
862: PetscScalar **asp;
865: bstrm->rbs = bstrm->cbs = bs;
867: rbs = cbs = bs;
868: bs2 = rbs*cbs;
869: blen = ai[MROW]-ai[0];
870: slen = blen*cbs;
872: PetscMalloc(bs2*blen*sizeof(PetscScalar), &bstrm->as);
874: PetscMalloc(rbs*sizeof(PetscScalar *), &asp);
875:
876: for(i=0;i<rbs;i++) asp[i] = bstrm->as + i*slen;
877:
878: for (k=0; k<blen; k++) {
879: for (j=0; j<cbs; j++)
880: for (i=0; i<rbs; i++)
881: asp[i][k*cbs+j] = aa[k*bs2+j*rbs+i];
882: }
884: PetscFree(asp);
886: switch (bs){
887: case 4:
888: A->ops->mult = MatMult_SeqBSTRM_4;
889: A->ops->multadd = MatMultAdd_SeqBSTRM_4;
890: A->ops->multtranspose = MatMultTranspose_SeqBSTRM_4;
891: A->ops->sor = MatSOR_SeqBSTRM_4;
892: break;
893: case 5:
894: A->ops->mult = MatMult_SeqBSTRM_5;
895: A->ops->multadd = MatMultAdd_SeqBSTRM_5;
896: A->ops->multtranspose = MatMultTranspose_SeqBSTRM_5;
897: A->ops->sor = MatSOR_SeqBSTRM_5;
898: break;
899: default:
900: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"not supported for block size %D",bs);
901: }
902: return(0);
903: }
904: /*=========================================================*/
905: /*=========================================================*/
906: EXTERN_C_BEGIN
909: PetscErrorCode MatSeqBSTRMTransform(Mat A)
910: {
912: MatSeqBSTRM_convert_bstrm(A);
913: return(0);
914: }
915: EXTERN_C_END