Actual source code: baij2.c
petsc-3.3-p7 2013-05-11
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
14: const PetscInt *idx;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,&table);
27: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
28: PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
34:
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n ; ++j){
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
47:
48: k = 0;
49: for (j=0; j<ov; j++){ /* for each overlap*/
50: n = isz;
51: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end ; l++){
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++)
64: nidx2[j*bs+k] = nidx[j]*bs+k;
65: }
66: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
67: }
68: PetscBTDestroy(&table);
69: PetscFree(nidx);
70: PetscFree(nidx2);
71: return(0);
72: }
76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: const PetscInt *irow,*icol;
83: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84: PetscInt *aj = a->j,*ai = a->i;
85: MatScalar *mat_a;
86: Mat C;
87: PetscBool flag,sorted;
90: ISSorted(iscol,&sorted);
91: if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
93: ISGetIndices(isrow,&irow);
94: ISGetIndices(iscol,&icol);
95: ISGetLocalSize(isrow,&nrows);
96: ISGetLocalSize(iscol,&ncols);
98: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
99: ssmap = smap;
100: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
101: PetscMemzero(smap,oldcols*sizeof(PetscInt));
102: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
103: /* determine lens of each row */
104: for (i=0; i<nrows; i++) {
105: kstart = ai[irow[i]];
106: kend = kstart + a->ilen[irow[i]];
107: lens[i] = 0;
108: for (k=kstart; k<kend; k++) {
109: if (ssmap[aj[k]]) {
110: lens[i]++;
111: }
112: }
113: }
114: /* Create and fill new matrix */
115: if (scall == MAT_REUSE_MATRIX) {
116: c = (Mat_SeqBAIJ *)((*B)->data);
118: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
119: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
120: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
122: C = *B;
123: } else {
124: MatCreate(((PetscObject)A)->comm,&C);
125: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
126: MatSetType(C,((PetscObject)A)->type_name);
127: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
128: }
129: c = (Mat_SeqBAIJ *)(C->data);
130: for (i=0; i<nrows; i++) {
131: row = irow[i];
132: kstart = ai[row];
133: kend = kstart + a->ilen[row];
134: mat_i = c->i[i];
135: mat_j = c->j + mat_i;
136: mat_a = c->a + mat_i*bs2;
137: mat_ilen = c->ilen + i;
138: for (k=kstart; k<kend; k++) {
139: if ((tcol=ssmap[a->j[k]])) {
140: *mat_j++ = tcol - 1;
141: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
142: mat_a += bs2;
143: (*mat_ilen)++;
144: }
145: }
146: }
147:
148: /* Free work space */
149: ISRestoreIndices(iscol,&icol);
150: PetscFree(smap);
151: PetscFree(lens);
152: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
153: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
154:
155: ISRestoreIndices(isrow,&irow);
156: *B = C;
157: return(0);
158: }
162: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
163: {
164: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
165: IS is1,is2;
167: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
168: const PetscInt *irow,*icol;
171: ISGetIndices(isrow,&irow);
172: ISGetIndices(iscol,&icol);
173: ISGetLocalSize(isrow,&nrows);
174: ISGetLocalSize(iscol,&ncols);
175:
176: /* Verify if the indices corespond to each element in a block
177: and form the IS with compressed IS */
178: PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
179: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
180: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
181: count = 0;
182: for (i=0; i<a->mbs; i++) {
183: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
184: if (vary[i]==bs) iary[count++] = i;
185: }
186: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
187:
188: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
189: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
190: count = 0;
191: for (i=0; i<a->mbs; i++) {
192: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
193: if (vary[i]==bs) iary[count++] = i;
194: }
195: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
196: ISRestoreIndices(isrow,&irow);
197: ISRestoreIndices(iscol,&icol);
198: PetscFree2(vary,iary);
200: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
201: ISDestroy(&is1);
202: ISDestroy(&is2);
203: return(0);
204: }
208: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
209: {
211: PetscInt i;
214: if (scall == MAT_INITIAL_MATRIX) {
215: PetscMalloc((n+1)*sizeof(Mat),B);
216: }
218: for (i=0; i<n; i++) {
219: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
220: }
221: return(0);
222: }
225: /* -------------------------------------------------------*/
226: /* Should check that shapes of vectors and matrices match */
227: /* -------------------------------------------------------*/
231: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
232: {
233: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
234: PetscScalar *z,sum;
235: const PetscScalar *x;
236: const MatScalar *v;
237: PetscErrorCode ierr;
238: PetscInt mbs,i,n,nonzerorow=0;
239: const PetscInt *idx,*ii,*ridx=PETSC_NULL;
240: PetscBool usecprow=a->compressedrow.use;
243: VecGetArrayRead(xx,&x);
244: VecGetArray(zz,&z);
246: if (usecprow){
247: mbs = a->compressedrow.nrows;
248: ii = a->compressedrow.i;
249: ridx = a->compressedrow.rindex;
250: PetscMemzero(z,mbs*sizeof(PetscScalar));
251: } else {
252: mbs = a->mbs;
253: ii = a->i;
254: }
256: for (i=0; i<mbs; i++) {
257: n = ii[1] - ii[0];
258: v = a->a + ii[0];
259: idx = a->j + ii[0];
260: ii++;
261: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
262: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
263: sum = 0.0;
264: PetscSparseDensePlusDot(sum,x,v,idx,n);
265: if (usecprow){
266: z[ridx[i]] = sum;
267: } else {
268: nonzerorow += (n>0);
269: z[i] = sum;
270: }
271: }
272: VecRestoreArrayRead(xx,&x);
273: VecRestoreArray(zz,&z);
274: PetscLogFlops(2.0*a->nz - nonzerorow);
275: return(0);
276: }
280: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
281: {
282: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
283: PetscScalar *z = 0,sum1,sum2,*zarray;
284: const PetscScalar *x,*xb;
285: PetscScalar x1,x2;
286: const MatScalar *v;
287: PetscErrorCode ierr;
288: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
289: PetscBool usecprow=a->compressedrow.use;
292: VecGetArrayRead(xx,&x);
293: VecGetArray(zz,&zarray);
295: idx = a->j;
296: v = a->a;
297: if (usecprow){
298: mbs = a->compressedrow.nrows;
299: ii = a->compressedrow.i;
300: ridx = a->compressedrow.rindex;
301: } else {
302: mbs = a->mbs;
303: ii = a->i;
304: z = zarray;
305: }
307: for (i=0; i<mbs; i++) {
308: n = ii[1] - ii[0]; ii++;
309: sum1 = 0.0; sum2 = 0.0;
310: nonzerorow += (n>0);
311: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
312: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
313: for (j=0; j<n; j++) {
314: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
315: sum1 += v[0]*x1 + v[2]*x2;
316: sum2 += v[1]*x1 + v[3]*x2;
317: v += 4;
318: }
319: if (usecprow) z = zarray + 2*ridx[i];
320: z[0] = sum1; z[1] = sum2;
321: if (!usecprow) z += 2;
322: }
323: VecRestoreArrayRead(xx,&x);
324: VecRestoreArray(zz,&zarray);
325: PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);
326: return(0);
327: }
331: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
332: {
333: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
334: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
335: const PetscScalar *x,*xb;
336: const MatScalar *v;
337: PetscErrorCode ierr;
338: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
339: PetscBool usecprow=a->compressedrow.use;
340:
342: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
343: #pragma disjoint(*v,*z,*xb)
344: #endif
347: VecGetArrayRead(xx,&x);
348: VecGetArray(zz,&zarray);
350: idx = a->j;
351: v = a->a;
352: if (usecprow){
353: mbs = a->compressedrow.nrows;
354: ii = a->compressedrow.i;
355: ridx = a->compressedrow.rindex;
356: } else {
357: mbs = a->mbs;
358: ii = a->i;
359: z = zarray;
360: }
362: for (i=0; i<mbs; i++) {
363: n = ii[1] - ii[0]; ii++;
364: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
365: nonzerorow += (n>0);
366: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
367: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
368: for (j=0; j<n; j++) {
369: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
370: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
371: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
372: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
373: v += 9;
374: }
375: if (usecprow) z = zarray + 3*ridx[i];
376: z[0] = sum1; z[1] = sum2; z[2] = sum3;
377: if (!usecprow) z += 3;
378: }
379: VecRestoreArrayRead(xx,&x);
380: VecRestoreArray(zz,&zarray);
381: PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);
382: return(0);
383: }
387: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
388: {
389: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
390: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
391: const PetscScalar *x,*xb;
392: const MatScalar *v;
393: PetscErrorCode ierr;
394: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
395: PetscBool usecprow=a->compressedrow.use;
398: VecGetArrayRead(xx,&x);
399: VecGetArray(zz,&zarray);
401: idx = a->j;
402: v = a->a;
403: if (usecprow){
404: mbs = a->compressedrow.nrows;
405: ii = a->compressedrow.i;
406: ridx = a->compressedrow.rindex;
407: } else {
408: mbs = a->mbs;
409: ii = a->i;
410: z = zarray;
411: }
413: for (i=0; i<mbs; i++) {
414: n = ii[1] - ii[0]; ii++;
415: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
416: nonzerorow += (n>0);
417: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
418: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
419: for (j=0; j<n; j++) {
420: xb = x + 4*(*idx++);
421: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
422: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
423: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
424: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
425: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
426: v += 16;
427: }
428: if (usecprow) z = zarray + 4*ridx[i];
429: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
430: if (!usecprow) z += 4;
431: }
432: VecRestoreArrayRead(xx,&x);
433: VecRestoreArray(zz,&zarray);
434: PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);
435: return(0);
436: }
440: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
441: {
442: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
443: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
444: const PetscScalar *xb,*x;
445: const MatScalar *v;
446: PetscErrorCode ierr;
447: const PetscInt *idx,*ii,*ridx=PETSC_NULL;
448: PetscInt mbs,i,j,n,nonzerorow=0;
449: PetscBool usecprow=a->compressedrow.use;
452: VecGetArrayRead(xx,&x);
453: VecGetArray(zz,&zarray);
455: idx = a->j;
456: v = a->a;
457: if (usecprow){
458: mbs = a->compressedrow.nrows;
459: ii = a->compressedrow.i;
460: ridx = a->compressedrow.rindex;
461: } else {
462: mbs = a->mbs;
463: ii = a->i;
464: z = zarray;
465: }
467: for (i=0; i<mbs; i++) {
468: n = ii[1] - ii[0]; ii++;
469: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
470: nonzerorow += (n>0);
471: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
472: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
473: for (j=0; j<n; j++) {
474: xb = x + 5*(*idx++);
475: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
476: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
477: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
478: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
479: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
480: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
481: v += 25;
482: }
483: if (usecprow) z = zarray + 5*ridx[i];
484: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
485: if (!usecprow) z += 5;
486: }
487: VecRestoreArrayRead(xx,&x);
488: VecRestoreArray(zz,&zarray);
489: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
490: return(0);
491: }
496: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
497: {
498: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
499: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
500: const PetscScalar *x,*xb;
501: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
502: const MatScalar *v;
503: PetscErrorCode ierr;
504: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
505: PetscBool usecprow=a->compressedrow.use;
508: VecGetArrayRead(xx,&x);
509: VecGetArray(zz,&zarray);
511: idx = a->j;
512: v = a->a;
513: if (usecprow){
514: mbs = a->compressedrow.nrows;
515: ii = a->compressedrow.i;
516: ridx = a->compressedrow.rindex;
517: } else {
518: mbs = a->mbs;
519: ii = a->i;
520: z = zarray;
521: }
523: for (i=0; i<mbs; i++) {
524: n = ii[1] - ii[0]; ii++;
525: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
526: nonzerorow += (n>0);
527: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
528: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
529: for (j=0; j<n; j++) {
530: xb = x + 6*(*idx++);
531: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
532: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
533: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
534: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
535: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
536: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
537: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
538: v += 36;
539: }
540: if (usecprow) z = zarray + 6*ridx[i];
541: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
542: if (!usecprow) z += 6;
543: }
545: VecRestoreArrayRead(xx,&x);
546: VecRestoreArray(zz,&zarray);
547: PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);
548: return(0);
549: }
553: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
554: {
555: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
556: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
557: const PetscScalar *x,*xb;
558: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
559: const MatScalar *v;
560: PetscErrorCode ierr;
561: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
562: PetscBool usecprow=a->compressedrow.use;
565: VecGetArrayRead(xx,&x);
566: VecGetArray(zz,&zarray);
568: idx = a->j;
569: v = a->a;
570: if (usecprow){
571: mbs = a->compressedrow.nrows;
572: ii = a->compressedrow.i;
573: ridx = a->compressedrow.rindex;
574: } else {
575: mbs = a->mbs;
576: ii = a->i;
577: z = zarray;
578: }
580: for (i=0; i<mbs; i++) {
581: n = ii[1] - ii[0]; ii++;
582: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
583: nonzerorow += (n>0);
584: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
585: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
586: for (j=0; j<n; j++) {
587: xb = x + 7*(*idx++);
588: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
589: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
590: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
591: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
592: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
593: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
594: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
595: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
596: v += 49;
597: }
598: if (usecprow) z = zarray + 7*ridx[i];
599: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
600: if (!usecprow) z += 7;
601: }
603: VecRestoreArrayRead(xx,&x);
604: VecRestoreArray(zz,&zarray);
605: PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);
606: return(0);
607: }
609: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
610: /* Default MatMult for block size 15 */
614: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
615: {
616: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
617: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
618: const PetscScalar *x,*xb;
619: PetscScalar *zarray,xv;
620: const MatScalar *v;
621: PetscErrorCode ierr;
622: const PetscInt *ii,*ij=a->j,*idx;
623: PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
624: PetscBool usecprow=a->compressedrow.use;
627: VecGetArrayRead(xx,&x);
628: VecGetArray(zz,&zarray);
630: v = a->a;
631: if (usecprow){
632: mbs = a->compressedrow.nrows;
633: ii = a->compressedrow.i;
634: ridx = a->compressedrow.rindex;
635: } else {
636: mbs = a->mbs;
637: ii = a->i;
638: z = zarray;
639: }
641: for (i=0; i<mbs; i++) {
642: n = ii[i+1] - ii[i];
643: idx = ij + ii[i];
644: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
645: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
647: nonzerorow += (n>0);
648: for (j=0; j<n; j++) {
649: xb = x + 15*(idx[j]);
651: for(k=0;k<15;k++){
652: xv = xb[k];
653: sum1 += v[0]*xv;
654: sum2 += v[1]*xv;
655: sum3 += v[2]*xv;
656: sum4 += v[3]*xv;
657: sum5 += v[4]*xv;
658: sum6 += v[5]*xv;
659: sum7 += v[6]*xv;
660: sum8 += v[7]*xv;
661: sum9 += v[8]*xv;
662: sum10 += v[9]*xv;
663: sum11 += v[10]*xv;
664: sum12 += v[11]*xv;
665: sum13 += v[12]*xv;
666: sum14 += v[13]*xv;
667: sum15 += v[14]*xv;
668: v += 15;
669: }
670: }
671: if (usecprow) z = zarray + 15*ridx[i];
672: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
673: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
675: if (!usecprow) z += 15;
676: }
678: VecRestoreArrayRead(xx,&x);
679: VecRestoreArray(zz,&zarray);
680: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
681: return(0);
682: }
684: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
687: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
688: {
689: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
690: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
691: const PetscScalar *x,*xb;
692: PetscScalar x1,x2,x3,x4,*zarray;
693: const MatScalar *v;
694: PetscErrorCode ierr;
695: const PetscInt *ii,*ij=a->j,*idx;
696: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
697: PetscBool usecprow=a->compressedrow.use;
700: VecGetArrayRead(xx,&x);
701: VecGetArray(zz,&zarray);
703: v = a->a;
704: if (usecprow){
705: mbs = a->compressedrow.nrows;
706: ii = a->compressedrow.i;
707: ridx = a->compressedrow.rindex;
708: } else {
709: mbs = a->mbs;
710: ii = a->i;
711: z = zarray;
712: }
714: for (i=0; i<mbs; i++) {
715: n = ii[i+1] - ii[i];
716: idx = ij + ii[i];
717: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
718: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
720: nonzerorow += (n>0);
721: for (j=0; j<n; j++) {
722: xb = x + 15*(idx[j]);
723: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
725: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
726: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
727: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
728: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
729: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
730: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
731: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
732: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
733: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
734: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
735: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
736: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
737: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
738: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
739: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
741: v += 60;
743: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
745: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
746: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
747: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
748: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
749: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
750: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
751: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
752: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
753: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
754: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
755: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
756: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
757: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
758: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
759: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
760: v += 60;
762: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
763: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
764: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
765: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
766: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
767: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
768: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
769: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
770: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
771: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
772: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
773: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
774: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
775: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
776: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
777: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
778: v += 60;
780: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
781: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
782: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
783: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
784: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
785: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
786: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
787: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
788: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
789: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
790: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
791: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
792: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
793: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
794: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
795: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
796: v += 45;
797: }
798: if (usecprow) z = zarray + 15*ridx[i];
799: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
800: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
802: if (!usecprow) z += 15;
803: }
805: VecRestoreArrayRead(xx,&x);
806: VecRestoreArray(zz,&zarray);
807: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
808: return(0);
809: }
811: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
814: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
815: {
816: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
817: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
818: const PetscScalar *x,*xb;
819: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
820: const MatScalar *v;
821: PetscErrorCode ierr;
822: const PetscInt *ii,*ij=a->j,*idx;
823: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
824: PetscBool usecprow=a->compressedrow.use;
827: VecGetArrayRead(xx,&x);
828: VecGetArray(zz,&zarray);
830: v = a->a;
831: if (usecprow){
832: mbs = a->compressedrow.nrows;
833: ii = a->compressedrow.i;
834: ridx = a->compressedrow.rindex;
835: } else {
836: mbs = a->mbs;
837: ii = a->i;
838: z = zarray;
839: }
841: for (i=0; i<mbs; i++) {
842: n = ii[i+1] - ii[i];
843: idx = ij + ii[i];
844: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
845: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
847: nonzerorow += (n>0);
848: for (j=0; j<n; j++) {
849: xb = x + 15*(idx[j]);
850: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
851: x8 = xb[7];
853: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
854: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
855: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
856: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
857: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
858: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
859: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
860: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
861: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
862: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
863: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
864: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
865: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
866: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
867: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
868: v += 120;
870: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
872: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
873: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
874: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
875: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
876: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
877: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
878: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
879: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
880: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
881: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
882: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
883: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
884: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
885: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
886: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
887: v += 105;
888: }
889: if (usecprow) z = zarray + 15*ridx[i];
890: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
891: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
893: if (!usecprow) z += 15;
894: }
896: VecRestoreArrayRead(xx,&x);
897: VecRestoreArray(zz,&zarray);
898: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
899: return(0);
900: }
902: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
906: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
907: {
908: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
909: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
910: const PetscScalar *x,*xb;
911: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
912: const MatScalar *v;
913: PetscErrorCode ierr;
914: const PetscInt *ii,*ij=a->j,*idx;
915: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
916: PetscBool usecprow=a->compressedrow.use;
919: VecGetArrayRead(xx,&x);
920: VecGetArray(zz,&zarray);
922: v = a->a;
923: if (usecprow){
924: mbs = a->compressedrow.nrows;
925: ii = a->compressedrow.i;
926: ridx = a->compressedrow.rindex;
927: } else {
928: mbs = a->mbs;
929: ii = a->i;
930: z = zarray;
931: }
933: for (i=0; i<mbs; i++) {
934: n = ii[i+1] - ii[i];
935: idx = ij + ii[i];
936: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
937: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
939: nonzerorow += (n>0);
940: for (j=0; j<n; j++) {
941: xb = x + 15*(idx[j]);
942: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
943: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
945: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
946: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
947: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
948: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
949: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
950: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
951: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
952: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
953: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
954: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
955: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
956: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
957: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
958: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
959: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
960: v += 225;
961: }
962: if (usecprow) z = zarray + 15*ridx[i];
963: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
964: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
966: if (!usecprow) z += 15;
967: }
969: VecRestoreArrayRead(xx,&x);
970: VecRestoreArray(zz,&zarray);
971: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
972: return(0);
973: }
976: /*
977: This will not work with MatScalar == float because it calls the BLAS
978: */
981: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
982: {
983: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
984: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
985: MatScalar *v;
987: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
988: PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
989: PetscBool usecprow=a->compressedrow.use;
992: VecGetArray(xx,&x);
993: VecGetArray(zz,&zarray);
995: idx = a->j;
996: v = a->a;
997: if (usecprow){
998: mbs = a->compressedrow.nrows;
999: ii = a->compressedrow.i;
1000: ridx = a->compressedrow.rindex;
1001: } else {
1002: mbs = a->mbs;
1003: ii = a->i;
1004: z = zarray;
1005: }
1007: if (!a->mult_work) {
1008: k = PetscMax(A->rmap->n,A->cmap->n);
1009: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1010: }
1011: work = a->mult_work;
1012: for (i=0; i<mbs; i++) {
1013: n = ii[1] - ii[0]; ii++;
1014: ncols = n*bs;
1015: workt = work;
1016: nonzerorow += (n>0);
1017: for (j=0; j<n; j++) {
1018: xb = x + bs*(*idx++);
1019: for (k=0; k<bs; k++) workt[k] = xb[k];
1020: workt += bs;
1021: }
1022: if (usecprow) z = zarray + bs*ridx[i];
1023: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1024: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1025: v += n*bs2;
1026: if (!usecprow) z += bs;
1027: }
1028: VecRestoreArray(xx,&x);
1029: VecRestoreArray(zz,&zarray);
1030: PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);
1031: return(0);
1032: }
1036: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1037: {
1038: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1039: const PetscScalar *x;
1040: PetscScalar *y,*z,sum;
1041: const MatScalar *v;
1042: PetscErrorCode ierr;
1043: PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1044: const PetscInt *idx,*ii;
1045: PetscBool usecprow=a->compressedrow.use;
1048: VecGetArrayRead(xx,&x);
1049: VecGetArray(yy,&y);
1050: if (zz != yy) {
1051: VecGetArray(zz,&z);
1052: } else {
1053: z = y;
1054: }
1056: idx = a->j;
1057: v = a->a;
1058: if (usecprow){
1059: if (zz != yy){
1060: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1061: }
1062: mbs = a->compressedrow.nrows;
1063: ii = a->compressedrow.i;
1064: ridx = a->compressedrow.rindex;
1065: } else {
1066: ii = a->i;
1067: }
1069: for (i=0; i<mbs; i++) {
1070: n = ii[1] - ii[0];
1071: ii++;
1072: if (!usecprow){
1073: nonzerorow += (n>0);
1074: sum = y[i];
1075: } else {
1076: sum = y[ridx[i]];
1077: }
1078: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1079: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1080: PetscSparseDensePlusDot(sum,x,v,idx,n);
1081: v += n;
1082: idx += n;
1083: if (usecprow){
1084: z[ridx[i]] = sum;
1085: } else {
1086: z[i] = sum;
1087: }
1088: }
1089: VecRestoreArrayRead(xx,&x);
1090: VecRestoreArray(yy,&y);
1091: if (zz != yy) {
1092: VecRestoreArray(zz,&z);
1093: }
1094: PetscLogFlops(2.0*a->nz - nonzerorow);
1095: return(0);
1096: }
1100: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1101: {
1102: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1103: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
1104: PetscScalar x1,x2,*yarray,*zarray;
1105: MatScalar *v;
1107: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1108: PetscBool usecprow=a->compressedrow.use;
1111: VecGetArray(xx,&x);
1112: VecGetArray(yy,&yarray);
1113: if (zz != yy) {
1114: VecGetArray(zz,&zarray);
1115: } else {
1116: zarray = yarray;
1117: }
1119: idx = a->j;
1120: v = a->a;
1121: if (usecprow){
1122: if (zz != yy){
1123: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1124: }
1125: mbs = a->compressedrow.nrows;
1126: ii = a->compressedrow.i;
1127: ridx = a->compressedrow.rindex;
1128: if (zz != yy){
1129: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1130: }
1131: } else {
1132: ii = a->i;
1133: y = yarray;
1134: z = zarray;
1135: }
1137: for (i=0; i<mbs; i++) {
1138: n = ii[1] - ii[0]; ii++;
1139: if (usecprow){
1140: z = zarray + 2*ridx[i];
1141: y = yarray + 2*ridx[i];
1142: }
1143: sum1 = y[0]; sum2 = y[1];
1144: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1145: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1146: for (j=0; j<n; j++) {
1147: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1148: sum1 += v[0]*x1 + v[2]*x2;
1149: sum2 += v[1]*x1 + v[3]*x2;
1150: v += 4;
1151: }
1152: z[0] = sum1; z[1] = sum2;
1153: if (!usecprow){
1154: z += 2; y += 2;
1155: }
1156: }
1157: VecRestoreArray(xx,&x);
1158: VecRestoreArray(yy,&yarray);
1159: if (zz != yy) {
1160: VecRestoreArray(zz,&zarray);
1161: }
1162: PetscLogFlops(4.0*a->nz);
1163: return(0);
1164: }
1168: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1169: {
1170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1171: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1172: MatScalar *v;
1174: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1175: PetscBool usecprow=a->compressedrow.use;
1178: VecGetArray(xx,&x);
1179: VecGetArray(yy,&yarray);
1180: if (zz != yy) {
1181: VecGetArray(zz,&zarray);
1182: } else {
1183: zarray = yarray;
1184: }
1186: idx = a->j;
1187: v = a->a;
1188: if (usecprow){
1189: if (zz != yy){
1190: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1191: }
1192: mbs = a->compressedrow.nrows;
1193: ii = a->compressedrow.i;
1194: ridx = a->compressedrow.rindex;
1195: } else {
1196: ii = a->i;
1197: y = yarray;
1198: z = zarray;
1199: }
1201: for (i=0; i<mbs; i++) {
1202: n = ii[1] - ii[0]; ii++;
1203: if (usecprow){
1204: z = zarray + 3*ridx[i];
1205: y = yarray + 3*ridx[i];
1206: }
1207: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1208: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1209: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1210: for (j=0; j<n; j++) {
1211: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1212: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1213: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1214: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1215: v += 9;
1216: }
1217: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1218: if (!usecprow){
1219: z += 3; y += 3;
1220: }
1221: }
1222: VecRestoreArray(xx,&x);
1223: VecRestoreArray(yy,&yarray);
1224: if (zz != yy) {
1225: VecRestoreArray(zz,&zarray);
1226: }
1227: PetscLogFlops(18.0*a->nz);
1228: return(0);
1229: }
1233: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1234: {
1235: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1236: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1237: MatScalar *v;
1239: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1240: PetscBool usecprow=a->compressedrow.use;
1243: VecGetArray(xx,&x);
1244: VecGetArray(yy,&yarray);
1245: if (zz != yy) {
1246: VecGetArray(zz,&zarray);
1247: } else {
1248: zarray = yarray;
1249: }
1251: idx = a->j;
1252: v = a->a;
1253: if (usecprow){
1254: if (zz != yy){
1255: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1256: }
1257: mbs = a->compressedrow.nrows;
1258: ii = a->compressedrow.i;
1259: ridx = a->compressedrow.rindex;
1260: } else {
1261: ii = a->i;
1262: y = yarray;
1263: z = zarray;
1264: }
1266: for (i=0; i<mbs; i++) {
1267: n = ii[1] - ii[0]; ii++;
1268: if (usecprow){
1269: z = zarray + 4*ridx[i];
1270: y = yarray + 4*ridx[i];
1271: }
1272: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1273: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1274: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1275: for (j=0; j<n; j++) {
1276: xb = x + 4*(*idx++);
1277: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1278: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1279: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1280: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1281: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1282: v += 16;
1283: }
1284: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1285: if (!usecprow){
1286: z += 4; y += 4;
1287: }
1288: }
1289: VecRestoreArray(xx,&x);
1290: VecRestoreArray(yy,&yarray);
1291: if (zz != yy) {
1292: VecRestoreArray(zz,&zarray);
1293: }
1294: PetscLogFlops(32.0*a->nz);
1295: return(0);
1296: }
1300: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1301: {
1302: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1303: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1304: PetscScalar *yarray,*zarray;
1305: MatScalar *v;
1307: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1308: PetscBool usecprow=a->compressedrow.use;
1311: VecGetArray(xx,&x);
1312: VecGetArray(yy,&yarray);
1313: if (zz != yy) {
1314: VecGetArray(zz,&zarray);
1315: } else {
1316: zarray = yarray;
1317: }
1319: idx = a->j;
1320: v = a->a;
1321: if (usecprow){
1322: if (zz != yy){
1323: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1324: }
1325: mbs = a->compressedrow.nrows;
1326: ii = a->compressedrow.i;
1327: ridx = a->compressedrow.rindex;
1328: } else {
1329: ii = a->i;
1330: y = yarray;
1331: z = zarray;
1332: }
1334: for (i=0; i<mbs; i++) {
1335: n = ii[1] - ii[0]; ii++;
1336: if (usecprow){
1337: z = zarray + 5*ridx[i];
1338: y = yarray + 5*ridx[i];
1339: }
1340: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1341: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1342: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1343: for (j=0; j<n; j++) {
1344: xb = x + 5*(*idx++);
1345: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1346: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1347: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1348: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1349: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1350: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1351: v += 25;
1352: }
1353: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1354: if (!usecprow){
1355: z += 5; y += 5;
1356: }
1357: }
1358: VecRestoreArray(xx,&x);
1359: VecRestoreArray(yy,&yarray);
1360: if (zz != yy) {
1361: VecRestoreArray(zz,&zarray);
1362: }
1363: PetscLogFlops(50.0*a->nz);
1364: return(0);
1365: }
1368: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1369: {
1370: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1371: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1372: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1373: MatScalar *v;
1375: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1376: PetscBool usecprow=a->compressedrow.use;
1379: VecGetArray(xx,&x);
1380: VecGetArray(yy,&yarray);
1381: if (zz != yy) {
1382: VecGetArray(zz,&zarray);
1383: } else {
1384: zarray = yarray;
1385: }
1387: idx = a->j;
1388: v = a->a;
1389: if (usecprow){
1390: if (zz != yy){
1391: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1392: }
1393: mbs = a->compressedrow.nrows;
1394: ii = a->compressedrow.i;
1395: ridx = a->compressedrow.rindex;
1396: } else {
1397: ii = a->i;
1398: y = yarray;
1399: z = zarray;
1400: }
1402: for (i=0; i<mbs; i++) {
1403: n = ii[1] - ii[0]; ii++;
1404: if (usecprow){
1405: z = zarray + 6*ridx[i];
1406: y = yarray + 6*ridx[i];
1407: }
1408: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1409: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1410: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1411: for (j=0; j<n; j++) {
1412: xb = x + 6*(*idx++);
1413: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1414: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1415: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1416: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1417: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1418: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1419: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1420: v += 36;
1421: }
1422: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1423: if (!usecprow){
1424: z += 6; y += 6;
1425: }
1426: }
1427: VecRestoreArray(xx,&x);
1428: VecRestoreArray(yy,&yarray);
1429: if (zz != yy) {
1430: VecRestoreArray(zz,&zarray);
1431: }
1432: PetscLogFlops(72.0*a->nz);
1433: return(0);
1434: }
1438: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1439: {
1440: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1441: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1442: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1443: MatScalar *v;
1445: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1446: PetscBool usecprow=a->compressedrow.use;
1449: VecGetArray(xx,&x);
1450: VecGetArray(yy,&yarray);
1451: if (zz != yy) {
1452: VecGetArray(zz,&zarray);
1453: } else {
1454: zarray = yarray;
1455: }
1457: idx = a->j;
1458: v = a->a;
1459: if (usecprow){
1460: if (zz != yy){
1461: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1462: }
1463: mbs = a->compressedrow.nrows;
1464: ii = a->compressedrow.i;
1465: ridx = a->compressedrow.rindex;
1466: } else {
1467: ii = a->i;
1468: y = yarray;
1469: z = zarray;
1470: }
1472: for (i=0; i<mbs; i++) {
1473: n = ii[1] - ii[0]; ii++;
1474: if (usecprow){
1475: z = zarray + 7*ridx[i];
1476: y = yarray + 7*ridx[i];
1477: }
1478: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1479: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1480: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1481: for (j=0; j<n; j++) {
1482: xb = x + 7*(*idx++);
1483: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1484: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1485: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1486: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1487: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1488: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1489: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1490: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1491: v += 49;
1492: }
1493: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1494: if (!usecprow){
1495: z += 7; y += 7;
1496: }
1497: }
1498: VecRestoreArray(xx,&x);
1499: VecRestoreArray(yy,&yarray);
1500: if (zz != yy) {
1501: VecRestoreArray(zz,&zarray);
1502: }
1503: PetscLogFlops(98.0*a->nz);
1504: return(0);
1505: }
1509: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1510: {
1511: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1512: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1513: MatScalar *v;
1515: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1516: PetscInt ncols,k,*ridx=PETSC_NULL;
1517: PetscBool usecprow=a->compressedrow.use;
1520: VecCopy(yy,zz);
1521: VecGetArray(xx,&x);
1522: VecGetArray(zz,&zarray);
1524: idx = a->j;
1525: v = a->a;
1526: if (usecprow){
1527: mbs = a->compressedrow.nrows;
1528: ii = a->compressedrow.i;
1529: ridx = a->compressedrow.rindex;
1530: } else {
1531: mbs = a->mbs;
1532: ii = a->i;
1533: z = zarray;
1534: }
1536: if (!a->mult_work) {
1537: k = PetscMax(A->rmap->n,A->cmap->n);
1538: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1539: }
1540: work = a->mult_work;
1541: for (i=0; i<mbs; i++) {
1542: n = ii[1] - ii[0]; ii++;
1543: ncols = n*bs;
1544: workt = work;
1545: for (j=0; j<n; j++) {
1546: xb = x + bs*(*idx++);
1547: for (k=0; k<bs; k++) workt[k] = xb[k];
1548: workt += bs;
1549: }
1550: if (usecprow) z = zarray + bs*ridx[i];
1551: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1552: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1553: v += n*bs2;
1554: if (!usecprow){
1555: z += bs;
1556: }
1557: }
1558: VecRestoreArray(xx,&x);
1559: VecRestoreArray(zz,&zarray);
1560: PetscLogFlops(2.0*a->nz*bs2);
1561: return(0);
1562: }
1566: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1567: {
1568: PetscScalar zero = 0.0;
1572: VecSet(zz,zero);
1573: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1574: return(0);
1575: }
1579: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1580: {
1581: PetscScalar zero = 0.0;
1585: VecSet(zz,zero);
1586: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1587: return(0);
1588: }
1592: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1594: {
1595: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1596: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1597: MatScalar *v;
1598: PetscErrorCode ierr;
1599: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1600: Mat_CompressedRow cprow = a->compressedrow;
1601: PetscBool usecprow=cprow.use;
1604: if (yy != zz) { VecCopy(yy,zz); }
1605: VecGetArray(xx,&x);
1606: VecGetArray(zz,&z);
1608: idx = a->j;
1609: v = a->a;
1610: if (usecprow){
1611: mbs = cprow.nrows;
1612: ii = cprow.i;
1613: ridx = cprow.rindex;
1614: } else {
1615: mbs=a->mbs;
1616: ii = a->i;
1617: xb = x;
1618: }
1620: switch (bs) {
1621: case 1:
1622: for (i=0; i<mbs; i++) {
1623: if (usecprow) xb = x + ridx[i];
1624: x1 = xb[0];
1625: ib = idx + ii[0];
1626: n = ii[1] - ii[0]; ii++;
1627: for (j=0; j<n; j++) {
1628: rval = ib[j];
1629: z[rval] += PetscConj(*v) * x1;
1630: v++;
1631: }
1632: if (!usecprow) xb++;
1633: }
1634: break;
1635: case 2:
1636: for (i=0; i<mbs; i++) {
1637: if (usecprow) xb = x + 2*ridx[i];
1638: x1 = xb[0]; x2 = xb[1];
1639: ib = idx + ii[0];
1640: n = ii[1] - ii[0]; ii++;
1641: for (j=0; j<n; j++) {
1642: rval = ib[j]*2;
1643: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1644: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1645: v += 4;
1646: }
1647: if (!usecprow) xb += 2;
1648: }
1649: break;
1650: case 3:
1651: for (i=0; i<mbs; i++) {
1652: if (usecprow) xb = x + 3*ridx[i];
1653: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1654: ib = idx + ii[0];
1655: n = ii[1] - ii[0]; ii++;
1656: for (j=0; j<n; j++) {
1657: rval = ib[j]*3;
1658: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1659: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1660: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1661: v += 9;
1662: }
1663: if (!usecprow) xb += 3;
1664: }
1665: break;
1666: case 4:
1667: for (i=0; i<mbs; i++) {
1668: if (usecprow) xb = x + 4*ridx[i];
1669: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1670: ib = idx + ii[0];
1671: n = ii[1] - ii[0]; ii++;
1672: for (j=0; j<n; j++) {
1673: rval = ib[j]*4;
1674: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1675: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1676: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1677: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1678: v += 16;
1679: }
1680: if (!usecprow) xb += 4;
1681: }
1682: break;
1683: case 5:
1684: for (i=0; i<mbs; i++) {
1685: if (usecprow) xb = x + 5*ridx[i];
1686: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1687: x4 = xb[3]; x5 = xb[4];
1688: ib = idx + ii[0];
1689: n = ii[1] - ii[0]; ii++;
1690: for (j=0; j<n; j++) {
1691: rval = ib[j]*5;
1692: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1693: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1694: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1695: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1696: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1697: v += 25;
1698: }
1699: if (!usecprow) xb += 5;
1700: }
1701: break;
1702: default: { /* block sizes larger than 5 by 5 are handled by BLAS */
1703: PetscInt ncols,k;
1704: PetscScalar *work,*workt,*xtmp;
1706: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1707: if (!a->mult_work) {
1708: k = PetscMax(A->rmap->n,A->cmap->n);
1709: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1710: }
1711: work = a->mult_work;
1712: xtmp = x;
1713: for (i=0; i<mbs; i++) {
1714: n = ii[1] - ii[0]; ii++;
1715: ncols = n*bs;
1716: PetscMemzero(work,ncols*sizeof(PetscScalar));
1717: if (usecprow) {
1718: xtmp = x + bs*ridx[i];
1719: }
1720: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1721: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1722: v += n*bs2;
1723: if (!usecprow) xtmp += bs;
1724: workt = work;
1725: for (j=0; j<n; j++) {
1726: zb = z + bs*(*idx++);
1727: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1728: workt += bs;
1729: }
1730: }
1731: }
1732: }
1733: VecRestoreArray(xx,&x);
1734: VecRestoreArray(zz,&z);
1735: PetscLogFlops(2.0*a->nz*a->bs2);
1736: return(0);
1737: }
1741: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1742: {
1743: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1744: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1745: MatScalar *v;
1746: PetscErrorCode ierr;
1747: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1748: Mat_CompressedRow cprow = a->compressedrow;
1749: PetscBool usecprow=cprow.use;
1752: if (yy != zz) { VecCopy(yy,zz); }
1753: VecGetArray(xx,&x);
1754: VecGetArray(zz,&z);
1756: idx = a->j;
1757: v = a->a;
1758: if (usecprow){
1759: mbs = cprow.nrows;
1760: ii = cprow.i;
1761: ridx = cprow.rindex;
1762: } else {
1763: mbs=a->mbs;
1764: ii = a->i;
1765: xb = x;
1766: }
1768: switch (bs) {
1769: case 1:
1770: for (i=0; i<mbs; i++) {
1771: if (usecprow) xb = x + ridx[i];
1772: x1 = xb[0];
1773: ib = idx + ii[0];
1774: n = ii[1] - ii[0]; ii++;
1775: for (j=0; j<n; j++) {
1776: rval = ib[j];
1777: z[rval] += *v * x1;
1778: v++;
1779: }
1780: if (!usecprow) xb++;
1781: }
1782: break;
1783: case 2:
1784: for (i=0; i<mbs; i++) {
1785: if (usecprow) xb = x + 2*ridx[i];
1786: x1 = xb[0]; x2 = xb[1];
1787: ib = idx + ii[0];
1788: n = ii[1] - ii[0]; ii++;
1789: for (j=0; j<n; j++) {
1790: rval = ib[j]*2;
1791: z[rval++] += v[0]*x1 + v[1]*x2;
1792: z[rval++] += v[2]*x1 + v[3]*x2;
1793: v += 4;
1794: }
1795: if (!usecprow) xb += 2;
1796: }
1797: break;
1798: case 3:
1799: for (i=0; i<mbs; i++) {
1800: if (usecprow) xb = x + 3*ridx[i];
1801: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1802: ib = idx + ii[0];
1803: n = ii[1] - ii[0]; ii++;
1804: for (j=0; j<n; j++) {
1805: rval = ib[j]*3;
1806: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1807: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1808: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1809: v += 9;
1810: }
1811: if (!usecprow) xb += 3;
1812: }
1813: break;
1814: case 4:
1815: for (i=0; i<mbs; i++) {
1816: if (usecprow) xb = x + 4*ridx[i];
1817: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1818: ib = idx + ii[0];
1819: n = ii[1] - ii[0]; ii++;
1820: for (j=0; j<n; j++) {
1821: rval = ib[j]*4;
1822: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1823: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1824: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1825: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1826: v += 16;
1827: }
1828: if (!usecprow) xb += 4;
1829: }
1830: break;
1831: case 5:
1832: for (i=0; i<mbs; i++) {
1833: if (usecprow) xb = x + 5*ridx[i];
1834: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1835: x4 = xb[3]; x5 = xb[4];
1836: ib = idx + ii[0];
1837: n = ii[1] - ii[0]; ii++;
1838: for (j=0; j<n; j++) {
1839: rval = ib[j]*5;
1840: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1841: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1842: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1843: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1844: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1845: v += 25;
1846: }
1847: if (!usecprow) xb += 5;
1848: }
1849: break;
1850: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1851: PetscInt ncols,k;
1852: PetscScalar *work,*workt,*xtmp;
1854: if (!a->mult_work) {
1855: k = PetscMax(A->rmap->n,A->cmap->n);
1856: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1857: }
1858: work = a->mult_work;
1859: xtmp = x;
1860: for (i=0; i<mbs; i++) {
1861: n = ii[1] - ii[0]; ii++;
1862: ncols = n*bs;
1863: PetscMemzero(work,ncols*sizeof(PetscScalar));
1864: if (usecprow) {
1865: xtmp = x + bs*ridx[i];
1866: }
1867: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1868: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1869: v += n*bs2;
1870: if (!usecprow) xtmp += bs;
1871: workt = work;
1872: for (j=0; j<n; j++) {
1873: zb = z + bs*(*idx++);
1874: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1875: workt += bs;
1876: }
1877: }
1878: }
1879: }
1880: VecRestoreArray(xx,&x);
1881: VecRestoreArray(zz,&z);
1882: PetscLogFlops(2.0*a->nz*a->bs2);
1883: return(0);
1884: }
1888: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1889: {
1890: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1891: PetscInt totalnz = a->bs2*a->nz;
1892: PetscScalar oalpha = alpha;
1894: PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz);
1897: BLASscal_(&tnz,&oalpha,a->a,&one);
1898: PetscLogFlops(totalnz);
1899: return(0);
1900: }
1904: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1905: {
1907: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1908: MatScalar *v = a->a;
1909: PetscReal sum = 0.0;
1910: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1913: if (type == NORM_FROBENIUS) {
1914: for (i=0; i< bs2*nz; i++) {
1915: #if defined(PETSC_USE_COMPLEX)
1916: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1917: #else
1918: sum += (*v)*(*v); v++;
1919: #endif
1920: }
1921: *norm = PetscSqrtReal(sum);
1922: } else if (type == NORM_1) { /* maximum column sum */
1923: PetscReal *tmp;
1924: PetscInt *bcol = a->j;
1925: PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1926: PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1927: for (i=0; i<nz; i++){
1928: for (j=0; j<bs; j++){
1929: k1 = bs*(*bcol) + j; /* column index */
1930: for (k=0; k<bs; k++){
1931: tmp[k1] += PetscAbsScalar(*v); v++;
1932: }
1933: }
1934: bcol++;
1935: }
1936: *norm = 0.0;
1937: for (j=0; j<A->cmap->n; j++) {
1938: if (tmp[j] > *norm) *norm = tmp[j];
1939: }
1940: PetscFree(tmp);
1941: } else if (type == NORM_INFINITY) { /* maximum row sum */
1942: *norm = 0.0;
1943: for (k=0; k<bs; k++) {
1944: for (j=0; j<a->mbs; j++) {
1945: v = a->a + bs2*a->i[j] + k;
1946: sum = 0.0;
1947: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1948: for (k1=0; k1<bs; k1++){
1949: sum += PetscAbsScalar(*v);
1950: v += bs;
1951: }
1952: }
1953: if (sum > *norm) *norm = sum;
1954: }
1955: }
1956: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1957: return(0);
1958: }
1963: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1964: {
1965: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1969: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1970: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1971: *flg = PETSC_FALSE;
1972: return(0);
1973: }
1974:
1975: /* if the a->i are the same */
1976: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1977: if (!*flg) {
1978: return(0);
1979: }
1980:
1981: /* if a->j are the same */
1982: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1983: if (!*flg) {
1984: return(0);
1985: }
1986: /* if a->a are the same */
1987: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1988: return(0);
1989:
1990: }
1994: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1995: {
1996: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1998: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1999: PetscScalar *x,zero = 0.0;
2000: MatScalar *aa,*aa_j;
2003: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2004: bs = A->rmap->bs;
2005: aa = a->a;
2006: ai = a->i;
2007: aj = a->j;
2008: ambs = a->mbs;
2009: bs2 = a->bs2;
2011: VecSet(v,zero);
2012: VecGetArray(v,&x);
2013: VecGetLocalSize(v,&n);
2014: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2015: for (i=0; i<ambs; i++) {
2016: for (j=ai[i]; j<ai[i+1]; j++) {
2017: if (aj[j] == i) {
2018: row = i*bs;
2019: aa_j = aa+j*bs2;
2020: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2021: break;
2022: }
2023: }
2024: }
2025: VecRestoreArray(v,&x);
2026: return(0);
2027: }
2031: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2032: {
2033: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2034: const PetscScalar *l,*r,*li,*ri;
2035: PetscScalar x;
2036: MatScalar *aa, *v;
2037: PetscErrorCode ierr;
2038: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2039: const PetscInt *ai,*aj;
2042: ai = a->i;
2043: aj = a->j;
2044: aa = a->a;
2045: m = A->rmap->n;
2046: n = A->cmap->n;
2047: bs = A->rmap->bs;
2048: mbs = a->mbs;
2049: bs2 = a->bs2;
2050: if (ll) {
2051: VecGetArrayRead(ll,&l);
2052: VecGetLocalSize(ll,&lm);
2053: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2054: for (i=0; i<mbs; i++) { /* for each block row */
2055: M = ai[i+1] - ai[i];
2056: li = l + i*bs;
2057: v = aa + bs2*ai[i];
2058: for (j=0; j<M; j++) { /* for each block */
2059: for (k=0; k<bs2; k++) {
2060: (*v++) *= li[k%bs];
2061: }
2062: }
2063: }
2064: VecRestoreArrayRead(ll,&l);
2065: PetscLogFlops(a->nz);
2066: }
2067:
2068: if (rr) {
2069: VecGetArrayRead(rr,&r);
2070: VecGetLocalSize(rr,&rn);
2071: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2072: for (i=0; i<mbs; i++) { /* for each block row */
2073: iai = ai[i];
2074: M = ai[i+1] - iai;
2075: v = aa + bs2*iai;
2076: for (j=0; j<M; j++) { /* for each block */
2077: ri = r + bs*aj[iai+j];
2078: for (k=0; k<bs; k++) {
2079: x = ri[k];
2080: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2081: v += bs;
2082: }
2083: }
2084: }
2085: VecRestoreArrayRead(rr,&r);
2086: PetscLogFlops(a->nz);
2087: }
2088: return(0);
2089: }
2094: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2095: {
2096: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2099: info->block_size = a->bs2;
2100: info->nz_allocated = a->bs2*a->maxnz;
2101: info->nz_used = a->bs2*a->nz;
2102: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2103: info->assemblies = A->num_ass;
2104: info->mallocs = A->info.mallocs;
2105: info->memory = ((PetscObject)A)->mem;
2106: if (A->factortype) {
2107: info->fill_ratio_given = A->info.fill_ratio_given;
2108: info->fill_ratio_needed = A->info.fill_ratio_needed;
2109: info->factor_mallocs = A->info.factor_mallocs;
2110: } else {
2111: info->fill_ratio_given = 0;
2112: info->fill_ratio_needed = 0;
2113: info->factor_mallocs = 0;
2114: }
2115: return(0);
2116: }
2121: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2122: {
2123: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2127: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2128: return(0);
2129: }