Actual source code: baij2.c
petsc-3.6.1 2015-08-06
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
14: const PetscInt *idx;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,&table);
27: PetscMalloc1(m+1,&nidx);
28: PetscMalloc1(A->rmap->N+1,&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n; ++j) {
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscCalloc1(1+oldcols,&smap);
96: ssmap = smap;
97: PetscMalloc1(1+nrows,&lens);
98: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99: /* determine lens of each row */
100: for (i=0; i<nrows; i++) {
101: kstart = ai[irow[i]];
102: kend = kstart + a->ilen[irow[i]];
103: lens[i] = 0;
104: for (k=kstart; k<kend; k++) {
105: if (ssmap[aj[k]]) lens[i]++;
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ*)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
114: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
116: C = *B;
117: } else {
118: MatCreate(PetscObjectComm((PetscObject)A),&C);
119: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120: MatSetType(C,((PetscObject)A)->type_name);
121: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
122: }
123: c = (Mat_SeqBAIJ*)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141: /* sort */
142: {
143: MatScalar *work;
145: PetscMalloc1(bs2,&work);
146: for (i=0; i<nrows; i++) {
147: PetscInt ilen;
148: mat_i = c->i[i];
149: mat_j = c->j + mat_i;
150: mat_a = c->a + mat_i*bs2;
151: ilen = c->ilen[i];
152: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
153: }
154: PetscFree(work);
155: }
157: /* Free work space */
158: ISRestoreIndices(iscol,&icol);
159: PetscFree(smap);
160: PetscFree(lens);
161: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
162: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
164: ISRestoreIndices(isrow,&irow);
165: *B = C;
166: return(0);
167: }
171: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
172: {
173: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
174: IS is1,is2;
176: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
177: const PetscInt *irow,*icol;
180: ISGetIndices(isrow,&irow);
181: ISGetIndices(iscol,&icol);
182: ISGetLocalSize(isrow,&nrows);
183: ISGetLocalSize(iscol,&ncols);
185: /* Verify if the indices corespond to each element in a block
186: and form the IS with compressed IS */
187: maxmnbs = PetscMax(a->mbs,a->nbs);
188: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
189: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
190: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
191: for (i=0; i<a->mbs; i++) {
192: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
193: }
194: count = 0;
195: for (i=0; i<nrows; i++) {
196: PetscInt j = irow[i] / bs;
197: if ((vary[j]--)==bs) iary[count++] = j;
198: }
199: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
201: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
202: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
203: for (i=0; i<a->nbs; i++) {
204: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
205: }
206: count = 0;
207: for (i=0; i<ncols; i++) {
208: PetscInt j = icol[i] / bs;
209: if ((vary[j]--)==bs) iary[count++] = j;
210: }
211: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
212: ISRestoreIndices(isrow,&irow);
213: ISRestoreIndices(iscol,&icol);
214: PetscFree2(vary,iary);
216: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
217: ISDestroy(&is1);
218: ISDestroy(&is2);
219: return(0);
220: }
224: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
225: {
227: PetscInt i;
230: if (scall == MAT_INITIAL_MATRIX) {
231: PetscMalloc1(n+1,B);
232: }
234: for (i=0; i<n; i++) {
235: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
236: }
237: return(0);
238: }
241: /* -------------------------------------------------------*/
242: /* Should check that shapes of vectors and matrices match */
243: /* -------------------------------------------------------*/
247: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
248: {
249: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
250: PetscScalar *z,sum;
251: const PetscScalar *x;
252: const MatScalar *v;
253: PetscErrorCode ierr;
254: PetscInt mbs,i,n;
255: const PetscInt *idx,*ii,*ridx=NULL;
256: PetscBool usecprow=a->compressedrow.use;
259: VecGetArrayRead(xx,&x);
260: VecGetArray(zz,&z);
262: if (usecprow) {
263: mbs = a->compressedrow.nrows;
264: ii = a->compressedrow.i;
265: ridx = a->compressedrow.rindex;
266: PetscMemzero(z,mbs*sizeof(PetscScalar));
267: } else {
268: mbs = a->mbs;
269: ii = a->i;
270: }
272: for (i=0; i<mbs; i++) {
273: n = ii[1] - ii[0];
274: v = a->a + ii[0];
275: idx = a->j + ii[0];
276: ii++;
277: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
278: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
279: sum = 0.0;
280: PetscSparseDensePlusDot(sum,x,v,idx,n);
281: if (usecprow) {
282: z[ridx[i]] = sum;
283: } else {
284: z[i] = sum;
285: }
286: }
287: VecRestoreArrayRead(xx,&x);
288: VecRestoreArray(zz,&z);
289: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
290: return(0);
291: }
295: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
296: {
297: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
298: PetscScalar *z = 0,sum1,sum2,*zarray;
299: const PetscScalar *x,*xb;
300: PetscScalar x1,x2;
301: const MatScalar *v;
302: PetscErrorCode ierr;
303: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
304: PetscBool usecprow=a->compressedrow.use;
307: VecGetArrayRead(xx,&x);
308: VecGetArray(zz,&zarray);
310: idx = a->j;
311: v = a->a;
312: if (usecprow) {
313: mbs = a->compressedrow.nrows;
314: ii = a->compressedrow.i;
315: ridx = a->compressedrow.rindex;
316: } else {
317: mbs = a->mbs;
318: ii = a->i;
319: z = zarray;
320: }
322: for (i=0; i<mbs; i++) {
323: n = ii[1] - ii[0]; ii++;
324: sum1 = 0.0; sum2 = 0.0;
325: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
326: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
327: for (j=0; j<n; j++) {
328: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
329: sum1 += v[0]*x1 + v[2]*x2;
330: sum2 += v[1]*x1 + v[3]*x2;
331: v += 4;
332: }
333: if (usecprow) z = zarray + 2*ridx[i];
334: z[0] = sum1; z[1] = sum2;
335: if (!usecprow) z += 2;
336: }
337: VecRestoreArrayRead(xx,&x);
338: VecRestoreArray(zz,&zarray);
339: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
340: return(0);
341: }
345: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
346: {
347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
348: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
349: const PetscScalar *x,*xb;
350: const MatScalar *v;
351: PetscErrorCode ierr;
352: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
353: PetscBool usecprow=a->compressedrow.use;
355: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
356: #pragma disjoint(*v,*z,*xb)
357: #endif
360: VecGetArrayRead(xx,&x);
361: VecGetArray(zz,&zarray);
363: idx = a->j;
364: v = a->a;
365: if (usecprow) {
366: mbs = a->compressedrow.nrows;
367: ii = a->compressedrow.i;
368: ridx = a->compressedrow.rindex;
369: } else {
370: mbs = a->mbs;
371: ii = a->i;
372: z = zarray;
373: }
375: for (i=0; i<mbs; i++) {
376: n = ii[1] - ii[0]; ii++;
377: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
378: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
379: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380: for (j=0; j<n; j++) {
381: xb = x + 3*(*idx++);
382: x1 = xb[0];
383: x2 = xb[1];
384: x3 = xb[2];
386: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
387: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
388: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
389: v += 9;
390: }
391: if (usecprow) z = zarray + 3*ridx[i];
392: z[0] = sum1; z[1] = sum2; z[2] = sum3;
393: if (!usecprow) z += 3;
394: }
395: VecRestoreArrayRead(xx,&x);
396: VecRestoreArray(zz,&zarray);
397: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
398: return(0);
399: }
403: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
404: {
405: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
406: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
407: const PetscScalar *x,*xb;
408: const MatScalar *v;
409: PetscErrorCode ierr;
410: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
411: PetscBool usecprow=a->compressedrow.use;
414: VecGetArrayRead(xx,&x);
415: VecGetArray(zz,&zarray);
417: idx = a->j;
418: v = a->a;
419: if (usecprow) {
420: mbs = a->compressedrow.nrows;
421: ii = a->compressedrow.i;
422: ridx = a->compressedrow.rindex;
423: } else {
424: mbs = a->mbs;
425: ii = a->i;
426: z = zarray;
427: }
429: for (i=0; i<mbs; i++) {
430: n = ii[1] - ii[0];
431: ii++;
432: sum1 = 0.0;
433: sum2 = 0.0;
434: sum3 = 0.0;
435: sum4 = 0.0;
437: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
438: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
439: for (j=0; j<n; j++) {
440: xb = x + 4*(*idx++);
441: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
442: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
443: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
444: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
445: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
446: v += 16;
447: }
448: if (usecprow) z = zarray + 4*ridx[i];
449: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
450: if (!usecprow) z += 4;
451: }
452: VecRestoreArrayRead(xx,&x);
453: VecRestoreArray(zz,&zarray);
454: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
455: return(0);
456: }
460: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
461: {
462: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
463: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
464: const PetscScalar *xb,*x;
465: const MatScalar *v;
466: PetscErrorCode ierr;
467: const PetscInt *idx,*ii,*ridx=NULL;
468: PetscInt mbs,i,j,n;
469: PetscBool usecprow=a->compressedrow.use;
472: VecGetArrayRead(xx,&x);
473: VecGetArray(zz,&zarray);
475: idx = a->j;
476: v = a->a;
477: if (usecprow) {
478: mbs = a->compressedrow.nrows;
479: ii = a->compressedrow.i;
480: ridx = a->compressedrow.rindex;
481: } else {
482: mbs = a->mbs;
483: ii = a->i;
484: z = zarray;
485: }
487: for (i=0; i<mbs; i++) {
488: n = ii[1] - ii[0]; ii++;
489: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
490: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
491: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
492: for (j=0; j<n; j++) {
493: xb = x + 5*(*idx++);
494: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
495: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
496: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
497: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
498: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
499: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
500: v += 25;
501: }
502: if (usecprow) z = zarray + 5*ridx[i];
503: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
504: if (!usecprow) z += 5;
505: }
506: VecRestoreArrayRead(xx,&x);
507: VecRestoreArray(zz,&zarray);
508: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
509: return(0);
510: }
516: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
517: {
518: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
519: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
520: const PetscScalar *x,*xb;
521: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
522: const MatScalar *v;
523: PetscErrorCode ierr;
524: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
525: PetscBool usecprow=a->compressedrow.use;
528: VecGetArrayRead(xx,&x);
529: VecGetArray(zz,&zarray);
531: idx = a->j;
532: v = a->a;
533: if (usecprow) {
534: mbs = a->compressedrow.nrows;
535: ii = a->compressedrow.i;
536: ridx = a->compressedrow.rindex;
537: } else {
538: mbs = a->mbs;
539: ii = a->i;
540: z = zarray;
541: }
543: for (i=0; i<mbs; i++) {
544: n = ii[1] - ii[0];
545: ii++;
546: sum1 = 0.0;
547: sum2 = 0.0;
548: sum3 = 0.0;
549: sum4 = 0.0;
550: sum5 = 0.0;
551: sum6 = 0.0;
553: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
554: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
555: for (j=0; j<n; j++) {
556: xb = x + 6*(*idx++);
557: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
558: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
559: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
560: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
561: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
562: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
563: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
564: v += 36;
565: }
566: if (usecprow) z = zarray + 6*ridx[i];
567: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
568: if (!usecprow) z += 6;
569: }
571: VecRestoreArrayRead(xx,&x);
572: VecRestoreArray(zz,&zarray);
573: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
574: return(0);
575: }
579: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
580: {
581: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
582: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
583: const PetscScalar *x,*xb;
584: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
585: const MatScalar *v;
586: PetscErrorCode ierr;
587: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
588: PetscBool usecprow=a->compressedrow.use;
591: VecGetArrayRead(xx,&x);
592: VecGetArray(zz,&zarray);
594: idx = a->j;
595: v = a->a;
596: if (usecprow) {
597: mbs = a->compressedrow.nrows;
598: ii = a->compressedrow.i;
599: ridx = a->compressedrow.rindex;
600: } else {
601: mbs = a->mbs;
602: ii = a->i;
603: z = zarray;
604: }
606: for (i=0; i<mbs; i++) {
607: n = ii[1] - ii[0];
608: ii++;
609: sum1 = 0.0;
610: sum2 = 0.0;
611: sum3 = 0.0;
612: sum4 = 0.0;
613: sum5 = 0.0;
614: sum6 = 0.0;
615: sum7 = 0.0;
617: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
618: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
619: for (j=0; j<n; j++) {
620: xb = x + 7*(*idx++);
621: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
622: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
623: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
624: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
625: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
626: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
627: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
628: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
629: v += 49;
630: }
631: if (usecprow) z = zarray + 7*ridx[i];
632: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
633: if (!usecprow) z += 7;
634: }
636: VecRestoreArrayRead(xx,&x);
637: VecRestoreArray(zz,&zarray);
638: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
639: return(0);
640: }
642: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
643: /* Default MatMult for block size 15 */
647: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
648: {
649: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
650: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
651: const PetscScalar *x,*xb;
652: PetscScalar *zarray,xv;
653: const MatScalar *v;
654: PetscErrorCode ierr;
655: const PetscInt *ii,*ij=a->j,*idx;
656: PetscInt mbs,i,j,k,n,*ridx=NULL;
657: PetscBool usecprow=a->compressedrow.use;
660: VecGetArrayRead(xx,&x);
661: VecGetArray(zz,&zarray);
663: v = a->a;
664: if (usecprow) {
665: mbs = a->compressedrow.nrows;
666: ii = a->compressedrow.i;
667: ridx = a->compressedrow.rindex;
668: } else {
669: mbs = a->mbs;
670: ii = a->i;
671: z = zarray;
672: }
674: for (i=0; i<mbs; i++) {
675: n = ii[i+1] - ii[i];
676: idx = ij + ii[i];
677: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
678: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
680: for (j=0; j<n; j++) {
681: xb = x + 15*(idx[j]);
683: for (k=0; k<15; k++) {
684: xv = xb[k];
685: sum1 += v[0]*xv;
686: sum2 += v[1]*xv;
687: sum3 += v[2]*xv;
688: sum4 += v[3]*xv;
689: sum5 += v[4]*xv;
690: sum6 += v[5]*xv;
691: sum7 += v[6]*xv;
692: sum8 += v[7]*xv;
693: sum9 += v[8]*xv;
694: sum10 += v[9]*xv;
695: sum11 += v[10]*xv;
696: sum12 += v[11]*xv;
697: sum13 += v[12]*xv;
698: sum14 += v[13]*xv;
699: sum15 += v[14]*xv;
700: v += 15;
701: }
702: }
703: if (usecprow) z = zarray + 15*ridx[i];
704: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
705: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
707: if (!usecprow) z += 15;
708: }
710: VecRestoreArrayRead(xx,&x);
711: VecRestoreArray(zz,&zarray);
712: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
713: return(0);
714: }
716: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
719: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
720: {
721: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
722: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
723: const PetscScalar *x,*xb;
724: PetscScalar x1,x2,x3,x4,*zarray;
725: const MatScalar *v;
726: PetscErrorCode ierr;
727: const PetscInt *ii,*ij=a->j,*idx;
728: PetscInt mbs,i,j,n,*ridx=NULL;
729: PetscBool usecprow=a->compressedrow.use;
732: VecGetArrayRead(xx,&x);
733: VecGetArray(zz,&zarray);
735: v = a->a;
736: if (usecprow) {
737: mbs = a->compressedrow.nrows;
738: ii = a->compressedrow.i;
739: ridx = a->compressedrow.rindex;
740: } else {
741: mbs = a->mbs;
742: ii = a->i;
743: z = zarray;
744: }
746: for (i=0; i<mbs; i++) {
747: n = ii[i+1] - ii[i];
748: idx = ij + ii[i];
749: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
750: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
752: for (j=0; j<n; j++) {
753: xb = x + 15*(idx[j]);
754: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
756: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
757: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
758: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
759: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
760: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
761: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
762: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
763: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
764: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
765: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
766: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
767: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
768: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
769: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
770: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
772: v += 60;
774: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
776: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
777: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
778: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
779: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
780: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
781: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
782: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
783: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
784: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
785: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
786: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
787: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
788: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
789: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
790: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
791: v += 60;
793: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
794: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
795: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
796: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
797: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
798: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
799: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
800: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
801: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
802: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
803: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
804: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
805: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
806: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
807: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
808: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
809: v += 60;
811: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
812: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
813: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
814: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
815: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
816: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
817: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
818: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
819: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
820: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
821: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
822: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
823: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
824: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
825: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
826: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
827: v += 45;
828: }
829: if (usecprow) z = zarray + 15*ridx[i];
830: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
831: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
833: if (!usecprow) z += 15;
834: }
836: VecRestoreArrayRead(xx,&x);
837: VecRestoreArray(zz,&zarray);
838: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
839: return(0);
840: }
842: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
845: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
846: {
847: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
848: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
849: const PetscScalar *x,*xb;
850: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
851: const MatScalar *v;
852: PetscErrorCode ierr;
853: const PetscInt *ii,*ij=a->j,*idx;
854: PetscInt mbs,i,j,n,*ridx=NULL;
855: PetscBool usecprow=a->compressedrow.use;
858: VecGetArrayRead(xx,&x);
859: VecGetArray(zz,&zarray);
861: v = a->a;
862: if (usecprow) {
863: mbs = a->compressedrow.nrows;
864: ii = a->compressedrow.i;
865: ridx = a->compressedrow.rindex;
866: } else {
867: mbs = a->mbs;
868: ii = a->i;
869: z = zarray;
870: }
872: for (i=0; i<mbs; i++) {
873: n = ii[i+1] - ii[i];
874: idx = ij + ii[i];
875: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
876: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
878: for (j=0; j<n; j++) {
879: xb = x + 15*(idx[j]);
880: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
881: x8 = xb[7];
883: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
884: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
885: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
886: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
887: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
888: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
889: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
890: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
891: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
892: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
893: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
894: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
895: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
896: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
897: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
898: v += 120;
900: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
902: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
903: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
904: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
905: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
906: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
907: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
908: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
909: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
910: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
911: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
912: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
913: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
914: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
915: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
916: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
917: v += 105;
918: }
919: if (usecprow) z = zarray + 15*ridx[i];
920: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
921: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
923: if (!usecprow) z += 15;
924: }
926: VecRestoreArrayRead(xx,&x);
927: VecRestoreArray(zz,&zarray);
928: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
929: return(0);
930: }
932: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
936: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
937: {
938: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
939: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
940: const PetscScalar *x,*xb;
941: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
942: const MatScalar *v;
943: PetscErrorCode ierr;
944: const PetscInt *ii,*ij=a->j,*idx;
945: PetscInt mbs,i,j,n,*ridx=NULL;
946: PetscBool usecprow=a->compressedrow.use;
949: VecGetArrayRead(xx,&x);
950: VecGetArray(zz,&zarray);
952: v = a->a;
953: if (usecprow) {
954: mbs = a->compressedrow.nrows;
955: ii = a->compressedrow.i;
956: ridx = a->compressedrow.rindex;
957: } else {
958: mbs = a->mbs;
959: ii = a->i;
960: z = zarray;
961: }
963: for (i=0; i<mbs; i++) {
964: n = ii[i+1] - ii[i];
965: idx = ij + ii[i];
966: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
967: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
969: for (j=0; j<n; j++) {
970: xb = x + 15*(idx[j]);
971: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
972: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
974: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
975: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
976: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
977: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
978: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
979: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
980: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
981: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
982: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
983: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
984: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
985: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
986: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
987: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
988: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
989: v += 225;
990: }
991: if (usecprow) z = zarray + 15*ridx[i];
992: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
993: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
995: if (!usecprow) z += 15;
996: }
998: VecRestoreArrayRead(xx,&x);
999: VecRestoreArray(zz,&zarray);
1000: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1001: return(0);
1002: }
1005: /*
1006: This will not work with MatScalar == float because it calls the BLAS
1007: */
1010: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1011: {
1012: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1013: PetscScalar *z = 0,*work,*workt,*zarray;
1014: const PetscScalar *x,*xb;
1015: const MatScalar *v;
1016: PetscErrorCode ierr;
1017: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1018: const PetscInt *idx,*ii,*ridx=NULL;
1019: PetscInt ncols,k;
1020: PetscBool usecprow=a->compressedrow.use;
1023: VecGetArrayRead(xx,&x);
1024: VecGetArray(zz,&zarray);
1026: idx = a->j;
1027: v = a->a;
1028: if (usecprow) {
1029: mbs = a->compressedrow.nrows;
1030: ii = a->compressedrow.i;
1031: ridx = a->compressedrow.rindex;
1032: } else {
1033: mbs = a->mbs;
1034: ii = a->i;
1035: z = zarray;
1036: }
1038: if (!a->mult_work) {
1039: k = PetscMax(A->rmap->n,A->cmap->n);
1040: PetscMalloc1(k+1,&a->mult_work);
1041: }
1042: work = a->mult_work;
1043: for (i=0; i<mbs; i++) {
1044: n = ii[1] - ii[0]; ii++;
1045: ncols = n*bs;
1046: workt = work;
1047: for (j=0; j<n; j++) {
1048: xb = x + bs*(*idx++);
1049: for (k=0; k<bs; k++) workt[k] = xb[k];
1050: workt += bs;
1051: }
1052: if (usecprow) z = zarray + bs*ridx[i];
1053: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1054: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1055: v += n*bs2;
1056: if (!usecprow) z += bs;
1057: }
1058: VecRestoreArrayRead(xx,&x);
1059: VecRestoreArray(zz,&zarray);
1060: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1061: return(0);
1062: }
1066: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1067: {
1068: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1069: const PetscScalar *x;
1070: PetscScalar *y,*z,sum;
1071: const MatScalar *v;
1072: PetscErrorCode ierr;
1073: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1074: const PetscInt *idx,*ii;
1075: PetscBool usecprow=a->compressedrow.use;
1078: VecGetArrayRead(xx,&x);
1079: VecGetArrayPair(yy,zz,&y,&z);
1081: idx = a->j;
1082: v = a->a;
1083: if (usecprow) {
1084: if (zz != yy) {
1085: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1086: }
1087: mbs = a->compressedrow.nrows;
1088: ii = a->compressedrow.i;
1089: ridx = a->compressedrow.rindex;
1090: } else {
1091: ii = a->i;
1092: }
1094: for (i=0; i<mbs; i++) {
1095: n = ii[1] - ii[0];
1096: ii++;
1097: if (!usecprow) {
1098: sum = y[i];
1099: } else {
1100: sum = y[ridx[i]];
1101: }
1102: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1103: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1104: PetscSparseDensePlusDot(sum,x,v,idx,n);
1105: v += n;
1106: idx += n;
1107: if (usecprow) {
1108: z[ridx[i]] = sum;
1109: } else {
1110: z[i] = sum;
1111: }
1112: }
1113: VecRestoreArrayRead(xx,&x);
1114: VecRestoreArrayPair(yy,zz,&y,&z);
1115: PetscLogFlops(2.0*a->nz);
1116: return(0);
1117: }
1121: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122: {
1123: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1124: PetscScalar *y = 0,*z = 0,sum1,sum2;
1125: const PetscScalar *x,*xb;
1126: PetscScalar x1,x2,*yarray,*zarray;
1127: const MatScalar *v;
1128: PetscErrorCode ierr;
1129: PetscInt mbs = a->mbs,i,n,j;
1130: const PetscInt *idx,*ii,*ridx = NULL;
1131: PetscBool usecprow = a->compressedrow.use;
1134: VecGetArrayRead(xx,&x);
1135: VecGetArrayPair(yy,zz,&yarray,&zarray);
1137: idx = a->j;
1138: v = a->a;
1139: if (usecprow) {
1140: if (zz != yy) {
1141: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1142: }
1143: mbs = a->compressedrow.nrows;
1144: ii = a->compressedrow.i;
1145: ridx = a->compressedrow.rindex;
1146: if (zz != yy) {
1147: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1148: }
1149: } else {
1150: ii = a->i;
1151: y = yarray;
1152: z = zarray;
1153: }
1155: for (i=0; i<mbs; i++) {
1156: n = ii[1] - ii[0]; ii++;
1157: if (usecprow) {
1158: z = zarray + 2*ridx[i];
1159: y = yarray + 2*ridx[i];
1160: }
1161: sum1 = y[0]; sum2 = y[1];
1162: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1163: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1164: for (j=0; j<n; j++) {
1165: xb = x + 2*(*idx++);
1166: x1 = xb[0];
1167: x2 = xb[1];
1169: sum1 += v[0]*x1 + v[2]*x2;
1170: sum2 += v[1]*x1 + v[3]*x2;
1171: v += 4;
1172: }
1173: z[0] = sum1; z[1] = sum2;
1174: if (!usecprow) {
1175: z += 2; y += 2;
1176: }
1177: }
1178: VecRestoreArrayRead(xx,&x);
1179: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1180: PetscLogFlops(4.0*a->nz);
1181: return(0);
1182: }
1186: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1187: {
1188: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1189: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1190: const PetscScalar *x,*xb;
1191: const MatScalar *v;
1192: PetscErrorCode ierr;
1193: PetscInt mbs = a->mbs,i,j,n;
1194: const PetscInt *idx,*ii,*ridx = NULL;
1195: PetscBool usecprow = a->compressedrow.use;
1198: VecGetArrayRead(xx,&x);
1199: VecGetArrayPair(yy,zz,&yarray,&zarray);
1201: idx = a->j;
1202: v = a->a;
1203: if (usecprow) {
1204: if (zz != yy) {
1205: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1206: }
1207: mbs = a->compressedrow.nrows;
1208: ii = a->compressedrow.i;
1209: ridx = a->compressedrow.rindex;
1210: } else {
1211: ii = a->i;
1212: y = yarray;
1213: z = zarray;
1214: }
1216: for (i=0; i<mbs; i++) {
1217: n = ii[1] - ii[0]; ii++;
1218: if (usecprow) {
1219: z = zarray + 3*ridx[i];
1220: y = yarray + 3*ridx[i];
1221: }
1222: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1223: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1224: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1225: for (j=0; j<n; j++) {
1226: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1227: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1228: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1229: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1230: v += 9;
1231: }
1232: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1233: if (!usecprow) {
1234: z += 3; y += 3;
1235: }
1236: }
1237: VecRestoreArrayRead(xx,&x);
1238: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1239: PetscLogFlops(18.0*a->nz);
1240: return(0);
1241: }
1245: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1246: {
1247: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1248: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1249: const PetscScalar *x,*xb;
1250: const MatScalar *v;
1251: PetscErrorCode ierr;
1252: PetscInt mbs = a->mbs,i,j,n;
1253: const PetscInt *idx,*ii,*ridx=NULL;
1254: PetscBool usecprow=a->compressedrow.use;
1257: VecGetArrayRead(xx,&x);
1258: VecGetArrayPair(yy,zz,&yarray,&zarray);
1260: idx = a->j;
1261: v = a->a;
1262: if (usecprow) {
1263: if (zz != yy) {
1264: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1265: }
1266: mbs = a->compressedrow.nrows;
1267: ii = a->compressedrow.i;
1268: ridx = a->compressedrow.rindex;
1269: } else {
1270: ii = a->i;
1271: y = yarray;
1272: z = zarray;
1273: }
1275: for (i=0; i<mbs; i++) {
1276: n = ii[1] - ii[0]; ii++;
1277: if (usecprow) {
1278: z = zarray + 4*ridx[i];
1279: y = yarray + 4*ridx[i];
1280: }
1281: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1282: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1283: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1284: for (j=0; j<n; j++) {
1285: xb = x + 4*(*idx++);
1286: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1287: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1288: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1289: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1290: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1291: v += 16;
1292: }
1293: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1294: if (!usecprow) {
1295: z += 4; y += 4;
1296: }
1297: }
1298: VecRestoreArrayRead(xx,&x);
1299: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1300: PetscLogFlops(32.0*a->nz);
1301: return(0);
1302: }
1306: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1307: {
1308: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1309: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1310: const PetscScalar *x,*xb;
1311: PetscScalar *yarray,*zarray;
1312: const MatScalar *v;
1313: PetscErrorCode ierr;
1314: PetscInt mbs = a->mbs,i,j,n;
1315: const PetscInt *idx,*ii,*ridx = NULL;
1316: PetscBool usecprow=a->compressedrow.use;
1319: VecGetArrayRead(xx,&x);
1320: VecGetArrayPair(yy,zz,&yarray,&zarray);
1322: idx = a->j;
1323: v = a->a;
1324: if (usecprow) {
1325: if (zz != yy) {
1326: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1327: }
1328: mbs = a->compressedrow.nrows;
1329: ii = a->compressedrow.i;
1330: ridx = a->compressedrow.rindex;
1331: } else {
1332: ii = a->i;
1333: y = yarray;
1334: z = zarray;
1335: }
1337: for (i=0; i<mbs; i++) {
1338: n = ii[1] - ii[0]; ii++;
1339: if (usecprow) {
1340: z = zarray + 5*ridx[i];
1341: y = yarray + 5*ridx[i];
1342: }
1343: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1344: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1345: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1346: for (j=0; j<n; j++) {
1347: xb = x + 5*(*idx++);
1348: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1349: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1350: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1351: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1352: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1353: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1354: v += 25;
1355: }
1356: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1357: if (!usecprow) {
1358: z += 5; y += 5;
1359: }
1360: }
1361: VecRestoreArrayRead(xx,&x);
1362: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1363: PetscLogFlops(50.0*a->nz);
1364: return(0);
1365: }
1368: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1369: {
1370: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1371: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1372: const PetscScalar *x,*xb;
1373: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1374: const MatScalar *v;
1375: PetscErrorCode ierr;
1376: PetscInt mbs = a->mbs,i,j,n;
1377: const PetscInt *idx,*ii,*ridx=NULL;
1378: PetscBool usecprow=a->compressedrow.use;
1381: VecGetArrayRead(xx,&x);
1382: VecGetArrayPair(yy,zz,&yarray,&zarray);
1384: idx = a->j;
1385: v = a->a;
1386: if (usecprow) {
1387: if (zz != yy) {
1388: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1389: }
1390: mbs = a->compressedrow.nrows;
1391: ii = a->compressedrow.i;
1392: ridx = a->compressedrow.rindex;
1393: } else {
1394: ii = a->i;
1395: y = yarray;
1396: z = zarray;
1397: }
1399: for (i=0; i<mbs; i++) {
1400: n = ii[1] - ii[0]; ii++;
1401: if (usecprow) {
1402: z = zarray + 6*ridx[i];
1403: y = yarray + 6*ridx[i];
1404: }
1405: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1406: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1407: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1408: for (j=0; j<n; j++) {
1409: xb = x + 6*(*idx++);
1410: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1411: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1412: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1413: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1414: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1415: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1416: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1417: v += 36;
1418: }
1419: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1420: if (!usecprow) {
1421: z += 6; y += 6;
1422: }
1423: }
1424: VecRestoreArrayRead(xx,&x);
1425: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1426: PetscLogFlops(72.0*a->nz);
1427: return(0);
1428: }
1432: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1433: {
1434: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1435: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1436: const PetscScalar *x,*xb;
1437: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1438: const MatScalar *v;
1439: PetscErrorCode ierr;
1440: PetscInt mbs = a->mbs,i,j,n;
1441: const PetscInt *idx,*ii,*ridx = NULL;
1442: PetscBool usecprow=a->compressedrow.use;
1445: VecGetArrayRead(xx,&x);
1446: VecGetArrayPair(yy,zz,&yarray,&zarray);
1448: idx = a->j;
1449: v = a->a;
1450: if (usecprow) {
1451: if (zz != yy) {
1452: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1453: }
1454: mbs = a->compressedrow.nrows;
1455: ii = a->compressedrow.i;
1456: ridx = a->compressedrow.rindex;
1457: } else {
1458: ii = a->i;
1459: y = yarray;
1460: z = zarray;
1461: }
1463: for (i=0; i<mbs; i++) {
1464: n = ii[1] - ii[0]; ii++;
1465: if (usecprow) {
1466: z = zarray + 7*ridx[i];
1467: y = yarray + 7*ridx[i];
1468: }
1469: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1470: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1471: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1472: for (j=0; j<n; j++) {
1473: xb = x + 7*(*idx++);
1474: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1475: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1476: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1477: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1478: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1479: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1480: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1481: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1482: v += 49;
1483: }
1484: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1485: if (!usecprow) {
1486: z += 7; y += 7;
1487: }
1488: }
1489: VecRestoreArrayRead(xx,&x);
1490: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1491: PetscLogFlops(98.0*a->nz);
1492: return(0);
1493: }
1497: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1498: {
1499: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1500: PetscScalar *z = 0,*work,*workt,*zarray;
1501: const PetscScalar *x,*xb;
1502: const MatScalar *v;
1503: PetscErrorCode ierr;
1504: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1505: PetscInt ncols,k;
1506: const PetscInt *ridx = NULL,*idx,*ii;
1507: PetscBool usecprow = a->compressedrow.use;
1510: VecCopy(yy,zz);
1511: VecGetArrayRead(xx,&x);
1512: VecGetArray(zz,&zarray);
1514: idx = a->j;
1515: v = a->a;
1516: if (usecprow) {
1517: mbs = a->compressedrow.nrows;
1518: ii = a->compressedrow.i;
1519: ridx = a->compressedrow.rindex;
1520: } else {
1521: mbs = a->mbs;
1522: ii = a->i;
1523: z = zarray;
1524: }
1526: if (!a->mult_work) {
1527: k = PetscMax(A->rmap->n,A->cmap->n);
1528: PetscMalloc1(k+1,&a->mult_work);
1529: }
1530: work = a->mult_work;
1531: for (i=0; i<mbs; i++) {
1532: n = ii[1] - ii[0]; ii++;
1533: ncols = n*bs;
1534: workt = work;
1535: for (j=0; j<n; j++) {
1536: xb = x + bs*(*idx++);
1537: for (k=0; k<bs; k++) workt[k] = xb[k];
1538: workt += bs;
1539: }
1540: if (usecprow) z = zarray + bs*ridx[i];
1541: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1542: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1543: v += n*bs2;
1544: if (!usecprow) z += bs;
1545: }
1546: VecRestoreArrayRead(xx,&x);
1547: VecRestoreArray(zz,&zarray);
1548: PetscLogFlops(2.0*a->nz*bs2);
1549: return(0);
1550: }
1554: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1555: {
1556: PetscScalar zero = 0.0;
1560: VecSet(zz,zero);
1561: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1562: return(0);
1563: }
1567: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1568: {
1569: PetscScalar zero = 0.0;
1573: VecSet(zz,zero);
1574: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1575: return(0);
1576: }
1580: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1581: {
1582: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1583: PetscScalar *z,x1,x2,x3,x4,x5;
1584: const PetscScalar *x,*xb = NULL;
1585: const MatScalar *v;
1586: PetscErrorCode ierr;
1587: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1588: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1589: Mat_CompressedRow cprow = a->compressedrow;
1590: PetscBool usecprow = cprow.use;
1593: if (yy != zz) { VecCopy(yy,zz); }
1594: VecGetArrayRead(xx,&x);
1595: VecGetArray(zz,&z);
1597: idx = a->j;
1598: v = a->a;
1599: if (usecprow) {
1600: mbs = cprow.nrows;
1601: ii = cprow.i;
1602: ridx = cprow.rindex;
1603: } else {
1604: mbs=a->mbs;
1605: ii = a->i;
1606: xb = x;
1607: }
1609: switch (bs) {
1610: case 1:
1611: for (i=0; i<mbs; i++) {
1612: if (usecprow) xb = x + ridx[i];
1613: x1 = xb[0];
1614: ib = idx + ii[0];
1615: n = ii[1] - ii[0]; ii++;
1616: for (j=0; j<n; j++) {
1617: rval = ib[j];
1618: z[rval] += PetscConj(*v) * x1;
1619: v++;
1620: }
1621: if (!usecprow) xb++;
1622: }
1623: break;
1624: case 2:
1625: for (i=0; i<mbs; i++) {
1626: if (usecprow) xb = x + 2*ridx[i];
1627: x1 = xb[0]; x2 = xb[1];
1628: ib = idx + ii[0];
1629: n = ii[1] - ii[0]; ii++;
1630: for (j=0; j<n; j++) {
1631: rval = ib[j]*2;
1632: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1633: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1634: v += 4;
1635: }
1636: if (!usecprow) xb += 2;
1637: }
1638: break;
1639: case 3:
1640: for (i=0; i<mbs; i++) {
1641: if (usecprow) xb = x + 3*ridx[i];
1642: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1643: ib = idx + ii[0];
1644: n = ii[1] - ii[0]; ii++;
1645: for (j=0; j<n; j++) {
1646: rval = ib[j]*3;
1647: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1648: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1649: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1650: v += 9;
1651: }
1652: if (!usecprow) xb += 3;
1653: }
1654: break;
1655: case 4:
1656: for (i=0; i<mbs; i++) {
1657: if (usecprow) xb = x + 4*ridx[i];
1658: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1659: ib = idx + ii[0];
1660: n = ii[1] - ii[0]; ii++;
1661: for (j=0; j<n; j++) {
1662: rval = ib[j]*4;
1663: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1664: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1665: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1666: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1667: v += 16;
1668: }
1669: if (!usecprow) xb += 4;
1670: }
1671: break;
1672: case 5:
1673: for (i=0; i<mbs; i++) {
1674: if (usecprow) xb = x + 5*ridx[i];
1675: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1676: x4 = xb[3]; x5 = xb[4];
1677: ib = idx + ii[0];
1678: n = ii[1] - ii[0]; ii++;
1679: for (j=0; j<n; j++) {
1680: rval = ib[j]*5;
1681: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1682: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1683: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1684: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1685: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1686: v += 25;
1687: }
1688: if (!usecprow) xb += 5;
1689: }
1690: break;
1691: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1692: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1693: #if 0
1694: {
1695: PetscInt ncols,k,bs2=a->bs2;
1696: PetscScalar *work,*workt,zb;
1697: const PetscScalar *xtmp;
1698: if (!a->mult_work) {
1699: k = PetscMax(A->rmap->n,A->cmap->n);
1700: PetscMalloc1(k+1,&a->mult_work);
1701: }
1702: work = a->mult_work;
1703: xtmp = x;
1704: for (i=0; i<mbs; i++) {
1705: n = ii[1] - ii[0]; ii++;
1706: ncols = n*bs;
1707: PetscMemzero(work,ncols*sizeof(PetscScalar));
1708: if (usecprow) xtmp = x + bs*ridx[i];
1709: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1710: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1711: v += n*bs2;
1712: if (!usecprow) xtmp += bs;
1713: workt = work;
1714: for (j=0; j<n; j++) {
1715: zb = z + bs*(*idx++);
1716: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1717: workt += bs;
1718: }
1719: }
1720: }
1721: #endif
1722: }
1723: VecRestoreArrayRead(xx,&x);
1724: VecRestoreArray(zz,&z);
1725: PetscLogFlops(2.0*a->nz*a->bs2);
1726: return(0);
1727: }
1731: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1732: {
1733: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1734: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
1735: const PetscScalar *x,*xb = 0;
1736: const MatScalar *v;
1737: PetscErrorCode ierr;
1738: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1739: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1740: Mat_CompressedRow cprow = a->compressedrow;
1741: PetscBool usecprow=cprow.use;
1744: if (yy != zz) { VecCopy(yy,zz); }
1745: VecGetArrayRead(xx,&x);
1746: VecGetArray(zz,&z);
1748: idx = a->j;
1749: v = a->a;
1750: if (usecprow) {
1751: mbs = cprow.nrows;
1752: ii = cprow.i;
1753: ridx = cprow.rindex;
1754: } else {
1755: mbs=a->mbs;
1756: ii = a->i;
1757: xb = x;
1758: }
1760: switch (bs) {
1761: case 1:
1762: for (i=0; i<mbs; i++) {
1763: if (usecprow) xb = x + ridx[i];
1764: x1 = xb[0];
1765: ib = idx + ii[0];
1766: n = ii[1] - ii[0]; ii++;
1767: for (j=0; j<n; j++) {
1768: rval = ib[j];
1769: z[rval] += *v * x1;
1770: v++;
1771: }
1772: if (!usecprow) xb++;
1773: }
1774: break;
1775: case 2:
1776: for (i=0; i<mbs; i++) {
1777: if (usecprow) xb = x + 2*ridx[i];
1778: x1 = xb[0]; x2 = xb[1];
1779: ib = idx + ii[0];
1780: n = ii[1] - ii[0]; ii++;
1781: for (j=0; j<n; j++) {
1782: rval = ib[j]*2;
1783: z[rval++] += v[0]*x1 + v[1]*x2;
1784: z[rval++] += v[2]*x1 + v[3]*x2;
1785: v += 4;
1786: }
1787: if (!usecprow) xb += 2;
1788: }
1789: break;
1790: case 3:
1791: for (i=0; i<mbs; i++) {
1792: if (usecprow) xb = x + 3*ridx[i];
1793: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1794: ib = idx + ii[0];
1795: n = ii[1] - ii[0]; ii++;
1796: for (j=0; j<n; j++) {
1797: rval = ib[j]*3;
1798: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1799: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1800: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1801: v += 9;
1802: }
1803: if (!usecprow) xb += 3;
1804: }
1805: break;
1806: case 4:
1807: for (i=0; i<mbs; i++) {
1808: if (usecprow) xb = x + 4*ridx[i];
1809: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1810: ib = idx + ii[0];
1811: n = ii[1] - ii[0]; ii++;
1812: for (j=0; j<n; j++) {
1813: rval = ib[j]*4;
1814: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1815: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1816: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1817: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1818: v += 16;
1819: }
1820: if (!usecprow) xb += 4;
1821: }
1822: break;
1823: case 5:
1824: for (i=0; i<mbs; i++) {
1825: if (usecprow) xb = x + 5*ridx[i];
1826: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1827: x4 = xb[3]; x5 = xb[4];
1828: ib = idx + ii[0];
1829: n = ii[1] - ii[0]; ii++;
1830: for (j=0; j<n; j++) {
1831: rval = ib[j]*5;
1832: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1833: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1834: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1835: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1836: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1837: v += 25;
1838: }
1839: if (!usecprow) xb += 5;
1840: }
1841: break;
1842: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1843: PetscInt ncols,k;
1844: PetscScalar *work,*workt;
1845: const PetscScalar *xtmp;
1846: if (!a->mult_work) {
1847: k = PetscMax(A->rmap->n,A->cmap->n);
1848: PetscMalloc1(k+1,&a->mult_work);
1849: }
1850: work = a->mult_work;
1851: xtmp = x;
1852: for (i=0; i<mbs; i++) {
1853: n = ii[1] - ii[0]; ii++;
1854: ncols = n*bs;
1855: PetscMemzero(work,ncols*sizeof(PetscScalar));
1856: if (usecprow) xtmp = x + bs*ridx[i];
1857: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1858: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1859: v += n*bs2;
1860: if (!usecprow) xtmp += bs;
1861: workt = work;
1862: for (j=0; j<n; j++) {
1863: zb = z + bs*(*idx++);
1864: for (k=0; k<bs; k++) zb[k] += workt[k];
1865: workt += bs;
1866: }
1867: }
1868: }
1869: }
1870: VecRestoreArrayRead(xx,&x);
1871: VecRestoreArray(zz,&z);
1872: PetscLogFlops(2.0*a->nz*a->bs2);
1873: return(0);
1874: }
1878: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1879: {
1880: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1881: PetscInt totalnz = a->bs2*a->nz;
1882: PetscScalar oalpha = alpha;
1884: PetscBLASInt one = 1,tnz;
1887: PetscBLASIntCast(totalnz,&tnz);
1888: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1889: PetscLogFlops(totalnz);
1890: return(0);
1891: }
1895: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1896: {
1898: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1899: MatScalar *v = a->a;
1900: PetscReal sum = 0.0;
1901: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1904: if (type == NORM_FROBENIUS) {
1905: for (i=0; i< bs2*nz; i++) {
1906: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1907: }
1908: *norm = PetscSqrtReal(sum);
1909: } else if (type == NORM_1) { /* maximum column sum */
1910: PetscReal *tmp;
1911: PetscInt *bcol = a->j;
1912: PetscCalloc1(A->cmap->n+1,&tmp);
1913: for (i=0; i<nz; i++) {
1914: for (j=0; j<bs; j++) {
1915: k1 = bs*(*bcol) + j; /* column index */
1916: for (k=0; k<bs; k++) {
1917: tmp[k1] += PetscAbsScalar(*v); v++;
1918: }
1919: }
1920: bcol++;
1921: }
1922: *norm = 0.0;
1923: for (j=0; j<A->cmap->n; j++) {
1924: if (tmp[j] > *norm) *norm = tmp[j];
1925: }
1926: PetscFree(tmp);
1927: } else if (type == NORM_INFINITY) { /* maximum row sum */
1928: *norm = 0.0;
1929: for (k=0; k<bs; k++) {
1930: for (j=0; j<a->mbs; j++) {
1931: v = a->a + bs2*a->i[j] + k;
1932: sum = 0.0;
1933: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1934: for (k1=0; k1<bs; k1++) {
1935: sum += PetscAbsScalar(*v);
1936: v += bs;
1937: }
1938: }
1939: if (sum > *norm) *norm = sum;
1940: }
1941: }
1942: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1943: return(0);
1944: }
1949: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1950: {
1951: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1955: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1956: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1957: *flg = PETSC_FALSE;
1958: return(0);
1959: }
1961: /* if the a->i are the same */
1962: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1963: if (!*flg) return(0);
1965: /* if a->j are the same */
1966: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1967: if (!*flg) return(0);
1969: /* if a->a are the same */
1970: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1971: return(0);
1973: }
1977: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1978: {
1979: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1981: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1982: PetscScalar *x,zero = 0.0;
1983: MatScalar *aa,*aa_j;
1986: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1987: bs = A->rmap->bs;
1988: aa = a->a;
1989: ai = a->i;
1990: aj = a->j;
1991: ambs = a->mbs;
1992: bs2 = a->bs2;
1994: VecSet(v,zero);
1995: VecGetArray(v,&x);
1996: VecGetLocalSize(v,&n);
1997: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1998: for (i=0; i<ambs; i++) {
1999: for (j=ai[i]; j<ai[i+1]; j++) {
2000: if (aj[j] == i) {
2001: row = i*bs;
2002: aa_j = aa+j*bs2;
2003: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2004: break;
2005: }
2006: }
2007: }
2008: VecRestoreArray(v,&x);
2009: return(0);
2010: }
2014: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2015: {
2016: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2017: const PetscScalar *l,*r,*li,*ri;
2018: PetscScalar x;
2019: MatScalar *aa, *v;
2020: PetscErrorCode ierr;
2021: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2022: const PetscInt *ai,*aj;
2025: ai = a->i;
2026: aj = a->j;
2027: aa = a->a;
2028: m = A->rmap->n;
2029: n = A->cmap->n;
2030: bs = A->rmap->bs;
2031: mbs = a->mbs;
2032: bs2 = a->bs2;
2033: if (ll) {
2034: VecGetArrayRead(ll,&l);
2035: VecGetLocalSize(ll,&lm);
2036: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2037: for (i=0; i<mbs; i++) { /* for each block row */
2038: M = ai[i+1] - ai[i];
2039: li = l + i*bs;
2040: v = aa + bs2*ai[i];
2041: for (j=0; j<M; j++) { /* for each block */
2042: for (k=0; k<bs2; k++) {
2043: (*v++) *= li[k%bs];
2044: }
2045: }
2046: }
2047: VecRestoreArrayRead(ll,&l);
2048: PetscLogFlops(a->nz);
2049: }
2051: if (rr) {
2052: VecGetArrayRead(rr,&r);
2053: VecGetLocalSize(rr,&rn);
2054: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2055: for (i=0; i<mbs; i++) { /* for each block row */
2056: iai = ai[i];
2057: M = ai[i+1] - iai;
2058: v = aa + bs2*iai;
2059: for (j=0; j<M; j++) { /* for each block */
2060: ri = r + bs*aj[iai+j];
2061: for (k=0; k<bs; k++) {
2062: x = ri[k];
2063: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2064: v += bs;
2065: }
2066: }
2067: }
2068: VecRestoreArrayRead(rr,&r);
2069: PetscLogFlops(a->nz);
2070: }
2071: return(0);
2072: }
2077: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2078: {
2079: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2082: info->block_size = a->bs2;
2083: info->nz_allocated = a->bs2*a->maxnz;
2084: info->nz_used = a->bs2*a->nz;
2085: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2086: info->assemblies = A->num_ass;
2087: info->mallocs = A->info.mallocs;
2088: info->memory = ((PetscObject)A)->mem;
2089: if (A->factortype) {
2090: info->fill_ratio_given = A->info.fill_ratio_given;
2091: info->fill_ratio_needed = A->info.fill_ratio_needed;
2092: info->factor_mallocs = A->info.factor_mallocs;
2093: } else {
2094: info->fill_ratio_given = 0;
2095: info->fill_ratio_needed = 0;
2096: info->factor_mallocs = 0;
2097: }
2098: return(0);
2099: }
2103: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2104: {
2105: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2109: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2110: return(0);
2111: }