Actual source code: baij2.c
petsc-3.7.3 2016-08-01
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
14: const PetscInt *idx;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,&table);
27: PetscMalloc1(m+1,&nidx);
28: PetscMalloc1(A->rmap->N+1,&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n; ++j) {
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
89: ISGetIndices(isrow,&irow);
90: ISGetIndices(iscol,&icol);
91: ISGetLocalSize(isrow,&nrows);
92: ISGetLocalSize(iscol,&ncols);
94: PetscCalloc1(1+oldcols,&smap);
95: ssmap = smap;
96: PetscMalloc1(1+nrows,&lens);
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) lens[i]++;
105: }
106: }
107: /* Create and fill new matrix */
108: if (scall == MAT_REUSE_MATRIX) {
109: c = (Mat_SeqBAIJ*)((*B)->data);
111: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
113: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
115: C = *B;
116: } else {
117: MatCreate(PetscObjectComm((PetscObject)A),&C);
118: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119: MatSetType(C,((PetscObject)A)->type_name);
120: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
121: }
122: c = (Mat_SeqBAIJ*)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140: /* sort */
141: {
142: MatScalar *work;
143: PetscMalloc1(bs2,&work);
144: for (i=0; i<nrows; i++) {
145: PetscInt ilen;
146: mat_i = c->i[i];
147: mat_j = c->j + mat_i;
148: mat_a = c->a + mat_i*bs2;
149: ilen = c->ilen[i];
150: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151: }
152: PetscFree(work);
153: }
155: /* Free work space */
156: ISRestoreIndices(iscol,&icol);
157: PetscFree(smap);
158: PetscFree(lens);
159: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
162: ISRestoreIndices(isrow,&irow);
163: *B = C;
164: return(0);
165: }
169: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
170: {
171: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
172: IS is1,is2;
174: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
175: const PetscInt *irow,*icol;
178: ISGetIndices(isrow,&irow);
179: ISGetIndices(iscol,&icol);
180: ISGetLocalSize(isrow,&nrows);
181: ISGetLocalSize(iscol,&ncols);
183: /* Verify if the indices corespond to each element in a block
184: and form the IS with compressed IS */
185: maxmnbs = PetscMax(a->mbs,a->nbs);
186: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
187: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
188: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
189: for (i=0; i<a->mbs; i++) {
190: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
191: }
192: count = 0;
193: for (i=0; i<nrows; i++) {
194: j = irow[i] / bs;
195: if ((vary[j]--)==bs) iary[count++] = j;
196: }
197: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
199: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
200: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
201: for (i=0; i<a->nbs; i++) {
202: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
203: }
204: count = 0;
205: for (i=0; i<ncols; i++) {
206: j = icol[i] / bs;
207: if ((vary[j]--)==bs) iary[count++] = j;
208: }
209: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
210: ISRestoreIndices(isrow,&irow);
211: ISRestoreIndices(iscol,&icol);
212: PetscFree2(vary,iary);
214: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
215: ISDestroy(&is1);
216: ISDestroy(&is2);
217: return(0);
218: }
222: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
223: {
225: PetscInt i;
228: if (scall == MAT_INITIAL_MATRIX) {
229: PetscMalloc1(n+1,B);
230: }
232: for (i=0; i<n; i++) {
233: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
234: }
235: return(0);
236: }
239: /* -------------------------------------------------------*/
240: /* Should check that shapes of vectors and matrices match */
241: /* -------------------------------------------------------*/
245: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
246: {
247: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
248: PetscScalar *z,sum;
249: const PetscScalar *x;
250: const MatScalar *v;
251: PetscErrorCode ierr;
252: PetscInt mbs,i,n;
253: const PetscInt *idx,*ii,*ridx=NULL;
254: PetscBool usecprow=a->compressedrow.use;
257: VecGetArrayRead(xx,&x);
258: VecGetArray(zz,&z);
260: if (usecprow) {
261: mbs = a->compressedrow.nrows;
262: ii = a->compressedrow.i;
263: ridx = a->compressedrow.rindex;
264: PetscMemzero(z,mbs*sizeof(PetscScalar));
265: } else {
266: mbs = a->mbs;
267: ii = a->i;
268: }
270: for (i=0; i<mbs; i++) {
271: n = ii[1] - ii[0];
272: v = a->a + ii[0];
273: idx = a->j + ii[0];
274: ii++;
275: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
276: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
277: sum = 0.0;
278: PetscSparseDensePlusDot(sum,x,v,idx,n);
279: if (usecprow) {
280: z[ridx[i]] = sum;
281: } else {
282: z[i] = sum;
283: }
284: }
285: VecRestoreArrayRead(xx,&x);
286: VecRestoreArray(zz,&z);
287: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
288: return(0);
289: }
293: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
294: {
295: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
296: PetscScalar *z = 0,sum1,sum2,*zarray;
297: const PetscScalar *x,*xb;
298: PetscScalar x1,x2;
299: const MatScalar *v;
300: PetscErrorCode ierr;
301: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
302: PetscBool usecprow=a->compressedrow.use;
305: VecGetArrayRead(xx,&x);
306: VecGetArray(zz,&zarray);
308: idx = a->j;
309: v = a->a;
310: if (usecprow) {
311: mbs = a->compressedrow.nrows;
312: ii = a->compressedrow.i;
313: ridx = a->compressedrow.rindex;
314: } else {
315: mbs = a->mbs;
316: ii = a->i;
317: z = zarray;
318: }
320: for (i=0; i<mbs; i++) {
321: n = ii[1] - ii[0]; ii++;
322: sum1 = 0.0; sum2 = 0.0;
323: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
324: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
325: for (j=0; j<n; j++) {
326: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
327: sum1 += v[0]*x1 + v[2]*x2;
328: sum2 += v[1]*x1 + v[3]*x2;
329: v += 4;
330: }
331: if (usecprow) z = zarray + 2*ridx[i];
332: z[0] = sum1; z[1] = sum2;
333: if (!usecprow) z += 2;
334: }
335: VecRestoreArrayRead(xx,&x);
336: VecRestoreArray(zz,&zarray);
337: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
338: return(0);
339: }
343: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
344: {
345: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
346: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
347: const PetscScalar *x,*xb;
348: const MatScalar *v;
349: PetscErrorCode ierr;
350: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
351: PetscBool usecprow=a->compressedrow.use;
353: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
354: #pragma disjoint(*v,*z,*xb)
355: #endif
358: VecGetArrayRead(xx,&x);
359: VecGetArray(zz,&zarray);
361: idx = a->j;
362: v = a->a;
363: if (usecprow) {
364: mbs = a->compressedrow.nrows;
365: ii = a->compressedrow.i;
366: ridx = a->compressedrow.rindex;
367: } else {
368: mbs = a->mbs;
369: ii = a->i;
370: z = zarray;
371: }
373: for (i=0; i<mbs; i++) {
374: n = ii[1] - ii[0]; ii++;
375: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
376: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
377: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378: for (j=0; j<n; j++) {
379: xb = x + 3*(*idx++);
380: x1 = xb[0];
381: x2 = xb[1];
382: x3 = xb[2];
384: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
385: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
386: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
387: v += 9;
388: }
389: if (usecprow) z = zarray + 3*ridx[i];
390: z[0] = sum1; z[1] = sum2; z[2] = sum3;
391: if (!usecprow) z += 3;
392: }
393: VecRestoreArrayRead(xx,&x);
394: VecRestoreArray(zz,&zarray);
395: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
396: return(0);
397: }
401: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
402: {
403: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
404: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
405: const PetscScalar *x,*xb;
406: const MatScalar *v;
407: PetscErrorCode ierr;
408: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
409: PetscBool usecprow=a->compressedrow.use;
412: VecGetArrayRead(xx,&x);
413: VecGetArray(zz,&zarray);
415: idx = a->j;
416: v = a->a;
417: if (usecprow) {
418: mbs = a->compressedrow.nrows;
419: ii = a->compressedrow.i;
420: ridx = a->compressedrow.rindex;
421: } else {
422: mbs = a->mbs;
423: ii = a->i;
424: z = zarray;
425: }
427: for (i=0; i<mbs; i++) {
428: n = ii[1] - ii[0];
429: ii++;
430: sum1 = 0.0;
431: sum2 = 0.0;
432: sum3 = 0.0;
433: sum4 = 0.0;
435: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
436: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437: for (j=0; j<n; j++) {
438: xb = x + 4*(*idx++);
439: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
440: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
441: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
442: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
443: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
444: v += 16;
445: }
446: if (usecprow) z = zarray + 4*ridx[i];
447: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
448: if (!usecprow) z += 4;
449: }
450: VecRestoreArrayRead(xx,&x);
451: VecRestoreArray(zz,&zarray);
452: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
453: return(0);
454: }
458: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
459: {
460: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
461: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
462: const PetscScalar *xb,*x;
463: const MatScalar *v;
464: PetscErrorCode ierr;
465: const PetscInt *idx,*ii,*ridx=NULL;
466: PetscInt mbs,i,j,n;
467: PetscBool usecprow=a->compressedrow.use;
470: VecGetArrayRead(xx,&x);
471: VecGetArray(zz,&zarray);
473: idx = a->j;
474: v = a->a;
475: if (usecprow) {
476: mbs = a->compressedrow.nrows;
477: ii = a->compressedrow.i;
478: ridx = a->compressedrow.rindex;
479: } else {
480: mbs = a->mbs;
481: ii = a->i;
482: z = zarray;
483: }
485: for (i=0; i<mbs; i++) {
486: n = ii[1] - ii[0]; ii++;
487: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
488: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
489: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
490: for (j=0; j<n; j++) {
491: xb = x + 5*(*idx++);
492: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
493: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
494: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
495: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
496: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
497: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
498: v += 25;
499: }
500: if (usecprow) z = zarray + 5*ridx[i];
501: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
502: if (!usecprow) z += 5;
503: }
504: VecRestoreArrayRead(xx,&x);
505: VecRestoreArray(zz,&zarray);
506: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
507: return(0);
508: }
514: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
517: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
518: const PetscScalar *x,*xb;
519: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
520: const MatScalar *v;
521: PetscErrorCode ierr;
522: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
523: PetscBool usecprow=a->compressedrow.use;
526: VecGetArrayRead(xx,&x);
527: VecGetArray(zz,&zarray);
529: idx = a->j;
530: v = a->a;
531: if (usecprow) {
532: mbs = a->compressedrow.nrows;
533: ii = a->compressedrow.i;
534: ridx = a->compressedrow.rindex;
535: } else {
536: mbs = a->mbs;
537: ii = a->i;
538: z = zarray;
539: }
541: for (i=0; i<mbs; i++) {
542: n = ii[1] - ii[0];
543: ii++;
544: sum1 = 0.0;
545: sum2 = 0.0;
546: sum3 = 0.0;
547: sum4 = 0.0;
548: sum5 = 0.0;
549: sum6 = 0.0;
551: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
552: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
553: for (j=0; j<n; j++) {
554: xb = x + 6*(*idx++);
555: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
556: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
557: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
558: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
559: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
560: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
561: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
562: v += 36;
563: }
564: if (usecprow) z = zarray + 6*ridx[i];
565: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
566: if (!usecprow) z += 6;
567: }
569: VecRestoreArrayRead(xx,&x);
570: VecRestoreArray(zz,&zarray);
571: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
572: return(0);
573: }
577: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
578: {
579: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
580: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
581: const PetscScalar *x,*xb;
582: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
583: const MatScalar *v;
584: PetscErrorCode ierr;
585: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
586: PetscBool usecprow=a->compressedrow.use;
589: VecGetArrayRead(xx,&x);
590: VecGetArray(zz,&zarray);
592: idx = a->j;
593: v = a->a;
594: if (usecprow) {
595: mbs = a->compressedrow.nrows;
596: ii = a->compressedrow.i;
597: ridx = a->compressedrow.rindex;
598: } else {
599: mbs = a->mbs;
600: ii = a->i;
601: z = zarray;
602: }
604: for (i=0; i<mbs; i++) {
605: n = ii[1] - ii[0];
606: ii++;
607: sum1 = 0.0;
608: sum2 = 0.0;
609: sum3 = 0.0;
610: sum4 = 0.0;
611: sum5 = 0.0;
612: sum6 = 0.0;
613: sum7 = 0.0;
615: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
616: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
617: for (j=0; j<n; j++) {
618: xb = x + 7*(*idx++);
619: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
620: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
621: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
622: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
623: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
624: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
625: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
626: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
627: v += 49;
628: }
629: if (usecprow) z = zarray + 7*ridx[i];
630: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
631: if (!usecprow) z += 7;
632: }
634: VecRestoreArrayRead(xx,&x);
635: VecRestoreArray(zz,&zarray);
636: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
637: return(0);
638: }
640: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
641: /* Default MatMult for block size 15 */
645: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
646: {
647: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
648: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
649: const PetscScalar *x,*xb;
650: PetscScalar *zarray,xv;
651: const MatScalar *v;
652: PetscErrorCode ierr;
653: const PetscInt *ii,*ij=a->j,*idx;
654: PetscInt mbs,i,j,k,n,*ridx=NULL;
655: PetscBool usecprow=a->compressedrow.use;
658: VecGetArrayRead(xx,&x);
659: VecGetArray(zz,&zarray);
661: v = a->a;
662: if (usecprow) {
663: mbs = a->compressedrow.nrows;
664: ii = a->compressedrow.i;
665: ridx = a->compressedrow.rindex;
666: } else {
667: mbs = a->mbs;
668: ii = a->i;
669: z = zarray;
670: }
672: for (i=0; i<mbs; i++) {
673: n = ii[i+1] - ii[i];
674: idx = ij + ii[i];
675: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
676: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
678: for (j=0; j<n; j++) {
679: xb = x + 15*(idx[j]);
681: for (k=0; k<15; k++) {
682: xv = xb[k];
683: sum1 += v[0]*xv;
684: sum2 += v[1]*xv;
685: sum3 += v[2]*xv;
686: sum4 += v[3]*xv;
687: sum5 += v[4]*xv;
688: sum6 += v[5]*xv;
689: sum7 += v[6]*xv;
690: sum8 += v[7]*xv;
691: sum9 += v[8]*xv;
692: sum10 += v[9]*xv;
693: sum11 += v[10]*xv;
694: sum12 += v[11]*xv;
695: sum13 += v[12]*xv;
696: sum14 += v[13]*xv;
697: sum15 += v[14]*xv;
698: v += 15;
699: }
700: }
701: if (usecprow) z = zarray + 15*ridx[i];
702: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
703: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
705: if (!usecprow) z += 15;
706: }
708: VecRestoreArrayRead(xx,&x);
709: VecRestoreArray(zz,&zarray);
710: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
711: return(0);
712: }
714: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
717: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
718: {
719: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
720: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
721: const PetscScalar *x,*xb;
722: PetscScalar x1,x2,x3,x4,*zarray;
723: const MatScalar *v;
724: PetscErrorCode ierr;
725: const PetscInt *ii,*ij=a->j,*idx;
726: PetscInt mbs,i,j,n,*ridx=NULL;
727: PetscBool usecprow=a->compressedrow.use;
730: VecGetArrayRead(xx,&x);
731: VecGetArray(zz,&zarray);
733: v = a->a;
734: if (usecprow) {
735: mbs = a->compressedrow.nrows;
736: ii = a->compressedrow.i;
737: ridx = a->compressedrow.rindex;
738: } else {
739: mbs = a->mbs;
740: ii = a->i;
741: z = zarray;
742: }
744: for (i=0; i<mbs; i++) {
745: n = ii[i+1] - ii[i];
746: idx = ij + ii[i];
747: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
748: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
750: for (j=0; j<n; j++) {
751: xb = x + 15*(idx[j]);
752: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
754: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
755: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
756: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
757: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
758: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
759: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
760: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
761: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
762: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
763: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
764: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
765: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
766: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
767: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
768: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
770: v += 60;
772: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
774: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
775: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
776: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
777: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
778: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
779: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
780: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
781: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
782: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
783: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
784: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
785: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
786: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
787: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
788: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
789: v += 60;
791: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
792: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
793: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
794: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
795: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
796: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
797: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
798: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
799: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
800: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
801: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
802: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
803: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
804: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
805: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
806: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
807: v += 60;
809: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
810: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
811: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
812: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
813: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
814: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
815: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
816: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
817: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
818: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
819: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
820: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
821: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
822: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
823: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
824: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
825: v += 45;
826: }
827: if (usecprow) z = zarray + 15*ridx[i];
828: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
829: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
831: if (!usecprow) z += 15;
832: }
834: VecRestoreArrayRead(xx,&x);
835: VecRestoreArray(zz,&zarray);
836: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
837: return(0);
838: }
840: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
843: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
844: {
845: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
846: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
847: const PetscScalar *x,*xb;
848: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
849: const MatScalar *v;
850: PetscErrorCode ierr;
851: const PetscInt *ii,*ij=a->j,*idx;
852: PetscInt mbs,i,j,n,*ridx=NULL;
853: PetscBool usecprow=a->compressedrow.use;
856: VecGetArrayRead(xx,&x);
857: VecGetArray(zz,&zarray);
859: v = a->a;
860: if (usecprow) {
861: mbs = a->compressedrow.nrows;
862: ii = a->compressedrow.i;
863: ridx = a->compressedrow.rindex;
864: } else {
865: mbs = a->mbs;
866: ii = a->i;
867: z = zarray;
868: }
870: for (i=0; i<mbs; i++) {
871: n = ii[i+1] - ii[i];
872: idx = ij + ii[i];
873: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
874: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
876: for (j=0; j<n; j++) {
877: xb = x + 15*(idx[j]);
878: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
879: x8 = xb[7];
881: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
882: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
883: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
884: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
885: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
886: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
887: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
888: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
889: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
890: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
891: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
892: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
893: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
894: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
895: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
896: v += 120;
898: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
900: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
901: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
902: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
903: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
904: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
905: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
906: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
907: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
908: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
909: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
910: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
911: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
912: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
913: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
914: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
915: v += 105;
916: }
917: if (usecprow) z = zarray + 15*ridx[i];
918: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
919: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
921: if (!usecprow) z += 15;
922: }
924: VecRestoreArrayRead(xx,&x);
925: VecRestoreArray(zz,&zarray);
926: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
927: return(0);
928: }
930: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
934: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
935: {
936: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
937: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
938: const PetscScalar *x,*xb;
939: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
940: const MatScalar *v;
941: PetscErrorCode ierr;
942: const PetscInt *ii,*ij=a->j,*idx;
943: PetscInt mbs,i,j,n,*ridx=NULL;
944: PetscBool usecprow=a->compressedrow.use;
947: VecGetArrayRead(xx,&x);
948: VecGetArray(zz,&zarray);
950: v = a->a;
951: if (usecprow) {
952: mbs = a->compressedrow.nrows;
953: ii = a->compressedrow.i;
954: ridx = a->compressedrow.rindex;
955: } else {
956: mbs = a->mbs;
957: ii = a->i;
958: z = zarray;
959: }
961: for (i=0; i<mbs; i++) {
962: n = ii[i+1] - ii[i];
963: idx = ij + ii[i];
964: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
965: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
967: for (j=0; j<n; j++) {
968: xb = x + 15*(idx[j]);
969: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
970: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
972: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
973: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
974: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
975: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
976: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
977: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
978: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
979: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
980: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
981: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
982: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
983: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
984: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
985: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
986: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
987: v += 225;
988: }
989: if (usecprow) z = zarray + 15*ridx[i];
990: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
991: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
993: if (!usecprow) z += 15;
994: }
996: VecRestoreArrayRead(xx,&x);
997: VecRestoreArray(zz,&zarray);
998: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
999: return(0);
1000: }
1003: /*
1004: This will not work with MatScalar == float because it calls the BLAS
1005: */
1008: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1009: {
1010: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1011: PetscScalar *z = 0,*work,*workt,*zarray;
1012: const PetscScalar *x,*xb;
1013: const MatScalar *v;
1014: PetscErrorCode ierr;
1015: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1016: const PetscInt *idx,*ii,*ridx=NULL;
1017: PetscInt ncols,k;
1018: PetscBool usecprow=a->compressedrow.use;
1021: VecGetArrayRead(xx,&x);
1022: VecGetArray(zz,&zarray);
1024: idx = a->j;
1025: v = a->a;
1026: if (usecprow) {
1027: mbs = a->compressedrow.nrows;
1028: ii = a->compressedrow.i;
1029: ridx = a->compressedrow.rindex;
1030: } else {
1031: mbs = a->mbs;
1032: ii = a->i;
1033: z = zarray;
1034: }
1036: if (!a->mult_work) {
1037: k = PetscMax(A->rmap->n,A->cmap->n);
1038: PetscMalloc1(k+1,&a->mult_work);
1039: }
1040: work = a->mult_work;
1041: for (i=0; i<mbs; i++) {
1042: n = ii[1] - ii[0]; ii++;
1043: ncols = n*bs;
1044: workt = work;
1045: for (j=0; j<n; j++) {
1046: xb = x + bs*(*idx++);
1047: for (k=0; k<bs; k++) workt[k] = xb[k];
1048: workt += bs;
1049: }
1050: if (usecprow) z = zarray + bs*ridx[i];
1051: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1052: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1053: v += n*bs2;
1054: if (!usecprow) z += bs;
1055: }
1056: VecRestoreArrayRead(xx,&x);
1057: VecRestoreArray(zz,&zarray);
1058: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1059: return(0);
1060: }
1064: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1065: {
1066: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1067: const PetscScalar *x;
1068: PetscScalar *y,*z,sum;
1069: const MatScalar *v;
1070: PetscErrorCode ierr;
1071: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1072: const PetscInt *idx,*ii;
1073: PetscBool usecprow=a->compressedrow.use;
1076: VecGetArrayRead(xx,&x);
1077: VecGetArrayPair(yy,zz,&y,&z);
1079: idx = a->j;
1080: v = a->a;
1081: if (usecprow) {
1082: if (zz != yy) {
1083: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1084: }
1085: mbs = a->compressedrow.nrows;
1086: ii = a->compressedrow.i;
1087: ridx = a->compressedrow.rindex;
1088: } else {
1089: ii = a->i;
1090: }
1092: for (i=0; i<mbs; i++) {
1093: n = ii[1] - ii[0];
1094: ii++;
1095: if (!usecprow) {
1096: sum = y[i];
1097: } else {
1098: sum = y[ridx[i]];
1099: }
1100: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1101: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1102: PetscSparseDensePlusDot(sum,x,v,idx,n);
1103: v += n;
1104: idx += n;
1105: if (usecprow) {
1106: z[ridx[i]] = sum;
1107: } else {
1108: z[i] = sum;
1109: }
1110: }
1111: VecRestoreArrayRead(xx,&x);
1112: VecRestoreArrayPair(yy,zz,&y,&z);
1113: PetscLogFlops(2.0*a->nz);
1114: return(0);
1115: }
1119: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1120: {
1121: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1122: PetscScalar *y = 0,*z = 0,sum1,sum2;
1123: const PetscScalar *x,*xb;
1124: PetscScalar x1,x2,*yarray,*zarray;
1125: const MatScalar *v;
1126: PetscErrorCode ierr;
1127: PetscInt mbs = a->mbs,i,n,j;
1128: const PetscInt *idx,*ii,*ridx = NULL;
1129: PetscBool usecprow = a->compressedrow.use;
1132: VecGetArrayRead(xx,&x);
1133: VecGetArrayPair(yy,zz,&yarray,&zarray);
1135: idx = a->j;
1136: v = a->a;
1137: if (usecprow) {
1138: if (zz != yy) {
1139: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1140: }
1141: mbs = a->compressedrow.nrows;
1142: ii = a->compressedrow.i;
1143: ridx = a->compressedrow.rindex;
1144: if (zz != yy) {
1145: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1146: }
1147: } else {
1148: ii = a->i;
1149: y = yarray;
1150: z = zarray;
1151: }
1153: for (i=0; i<mbs; i++) {
1154: n = ii[1] - ii[0]; ii++;
1155: if (usecprow) {
1156: z = zarray + 2*ridx[i];
1157: y = yarray + 2*ridx[i];
1158: }
1159: sum1 = y[0]; sum2 = y[1];
1160: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1161: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1162: for (j=0; j<n; j++) {
1163: xb = x + 2*(*idx++);
1164: x1 = xb[0];
1165: x2 = xb[1];
1167: sum1 += v[0]*x1 + v[2]*x2;
1168: sum2 += v[1]*x1 + v[3]*x2;
1169: v += 4;
1170: }
1171: z[0] = sum1; z[1] = sum2;
1172: if (!usecprow) {
1173: z += 2; y += 2;
1174: }
1175: }
1176: VecRestoreArrayRead(xx,&x);
1177: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1178: PetscLogFlops(4.0*a->nz);
1179: return(0);
1180: }
1184: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1185: {
1186: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1187: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1188: const PetscScalar *x,*xb;
1189: const MatScalar *v;
1190: PetscErrorCode ierr;
1191: PetscInt mbs = a->mbs,i,j,n;
1192: const PetscInt *idx,*ii,*ridx = NULL;
1193: PetscBool usecprow = a->compressedrow.use;
1196: VecGetArrayRead(xx,&x);
1197: VecGetArrayPair(yy,zz,&yarray,&zarray);
1199: idx = a->j;
1200: v = a->a;
1201: if (usecprow) {
1202: if (zz != yy) {
1203: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1204: }
1205: mbs = a->compressedrow.nrows;
1206: ii = a->compressedrow.i;
1207: ridx = a->compressedrow.rindex;
1208: } else {
1209: ii = a->i;
1210: y = yarray;
1211: z = zarray;
1212: }
1214: for (i=0; i<mbs; i++) {
1215: n = ii[1] - ii[0]; ii++;
1216: if (usecprow) {
1217: z = zarray + 3*ridx[i];
1218: y = yarray + 3*ridx[i];
1219: }
1220: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1221: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1222: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1223: for (j=0; j<n; j++) {
1224: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1225: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1226: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1227: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1228: v += 9;
1229: }
1230: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1231: if (!usecprow) {
1232: z += 3; y += 3;
1233: }
1234: }
1235: VecRestoreArrayRead(xx,&x);
1236: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1237: PetscLogFlops(18.0*a->nz);
1238: return(0);
1239: }
1243: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1244: {
1245: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1246: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1247: const PetscScalar *x,*xb;
1248: const MatScalar *v;
1249: PetscErrorCode ierr;
1250: PetscInt mbs = a->mbs,i,j,n;
1251: const PetscInt *idx,*ii,*ridx=NULL;
1252: PetscBool usecprow=a->compressedrow.use;
1255: VecGetArrayRead(xx,&x);
1256: VecGetArrayPair(yy,zz,&yarray,&zarray);
1258: idx = a->j;
1259: v = a->a;
1260: if (usecprow) {
1261: if (zz != yy) {
1262: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1263: }
1264: mbs = a->compressedrow.nrows;
1265: ii = a->compressedrow.i;
1266: ridx = a->compressedrow.rindex;
1267: } else {
1268: ii = a->i;
1269: y = yarray;
1270: z = zarray;
1271: }
1273: for (i=0; i<mbs; i++) {
1274: n = ii[1] - ii[0]; ii++;
1275: if (usecprow) {
1276: z = zarray + 4*ridx[i];
1277: y = yarray + 4*ridx[i];
1278: }
1279: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1280: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1281: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1282: for (j=0; j<n; j++) {
1283: xb = x + 4*(*idx++);
1284: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1285: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1286: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1287: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1288: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1289: v += 16;
1290: }
1291: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1292: if (!usecprow) {
1293: z += 4; y += 4;
1294: }
1295: }
1296: VecRestoreArrayRead(xx,&x);
1297: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1298: PetscLogFlops(32.0*a->nz);
1299: return(0);
1300: }
1304: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1305: {
1306: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1307: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1308: const PetscScalar *x,*xb;
1309: PetscScalar *yarray,*zarray;
1310: const MatScalar *v;
1311: PetscErrorCode ierr;
1312: PetscInt mbs = a->mbs,i,j,n;
1313: const PetscInt *idx,*ii,*ridx = NULL;
1314: PetscBool usecprow=a->compressedrow.use;
1317: VecGetArrayRead(xx,&x);
1318: VecGetArrayPair(yy,zz,&yarray,&zarray);
1320: idx = a->j;
1321: v = a->a;
1322: if (usecprow) {
1323: if (zz != yy) {
1324: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1325: }
1326: mbs = a->compressedrow.nrows;
1327: ii = a->compressedrow.i;
1328: ridx = a->compressedrow.rindex;
1329: } else {
1330: ii = a->i;
1331: y = yarray;
1332: z = zarray;
1333: }
1335: for (i=0; i<mbs; i++) {
1336: n = ii[1] - ii[0]; ii++;
1337: if (usecprow) {
1338: z = zarray + 5*ridx[i];
1339: y = yarray + 5*ridx[i];
1340: }
1341: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1342: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1343: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1344: for (j=0; j<n; j++) {
1345: xb = x + 5*(*idx++);
1346: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1347: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1348: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1349: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1350: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1351: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1352: v += 25;
1353: }
1354: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1355: if (!usecprow) {
1356: z += 5; y += 5;
1357: }
1358: }
1359: VecRestoreArrayRead(xx,&x);
1360: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1361: PetscLogFlops(50.0*a->nz);
1362: return(0);
1363: }
1366: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1367: {
1368: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1369: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1370: const PetscScalar *x,*xb;
1371: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1372: const MatScalar *v;
1373: PetscErrorCode ierr;
1374: PetscInt mbs = a->mbs,i,j,n;
1375: const PetscInt *idx,*ii,*ridx=NULL;
1376: PetscBool usecprow=a->compressedrow.use;
1379: VecGetArrayRead(xx,&x);
1380: VecGetArrayPair(yy,zz,&yarray,&zarray);
1382: idx = a->j;
1383: v = a->a;
1384: if (usecprow) {
1385: if (zz != yy) {
1386: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1387: }
1388: mbs = a->compressedrow.nrows;
1389: ii = a->compressedrow.i;
1390: ridx = a->compressedrow.rindex;
1391: } else {
1392: ii = a->i;
1393: y = yarray;
1394: z = zarray;
1395: }
1397: for (i=0; i<mbs; i++) {
1398: n = ii[1] - ii[0]; ii++;
1399: if (usecprow) {
1400: z = zarray + 6*ridx[i];
1401: y = yarray + 6*ridx[i];
1402: }
1403: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1404: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1405: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1406: for (j=0; j<n; j++) {
1407: xb = x + 6*(*idx++);
1408: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1409: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415: v += 36;
1416: }
1417: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1418: if (!usecprow) {
1419: z += 6; y += 6;
1420: }
1421: }
1422: VecRestoreArrayRead(xx,&x);
1423: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1424: PetscLogFlops(72.0*a->nz);
1425: return(0);
1426: }
1430: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1431: {
1432: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1433: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1434: const PetscScalar *x,*xb;
1435: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1436: const MatScalar *v;
1437: PetscErrorCode ierr;
1438: PetscInt mbs = a->mbs,i,j,n;
1439: const PetscInt *idx,*ii,*ridx = NULL;
1440: PetscBool usecprow=a->compressedrow.use;
1443: VecGetArrayRead(xx,&x);
1444: VecGetArrayPair(yy,zz,&yarray,&zarray);
1446: idx = a->j;
1447: v = a->a;
1448: if (usecprow) {
1449: if (zz != yy) {
1450: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1451: }
1452: mbs = a->compressedrow.nrows;
1453: ii = a->compressedrow.i;
1454: ridx = a->compressedrow.rindex;
1455: } else {
1456: ii = a->i;
1457: y = yarray;
1458: z = zarray;
1459: }
1461: for (i=0; i<mbs; i++) {
1462: n = ii[1] - ii[0]; ii++;
1463: if (usecprow) {
1464: z = zarray + 7*ridx[i];
1465: y = yarray + 7*ridx[i];
1466: }
1467: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1468: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1469: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1470: for (j=0; j<n; j++) {
1471: xb = x + 7*(*idx++);
1472: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1473: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1474: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1475: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1476: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1477: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1478: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1479: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1480: v += 49;
1481: }
1482: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1483: if (!usecprow) {
1484: z += 7; y += 7;
1485: }
1486: }
1487: VecRestoreArrayRead(xx,&x);
1488: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1489: PetscLogFlops(98.0*a->nz);
1490: return(0);
1491: }
1495: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1496: {
1497: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1498: PetscScalar *z = 0,*work,*workt,*zarray;
1499: const PetscScalar *x,*xb;
1500: const MatScalar *v;
1501: PetscErrorCode ierr;
1502: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1503: PetscInt ncols,k;
1504: const PetscInt *ridx = NULL,*idx,*ii;
1505: PetscBool usecprow = a->compressedrow.use;
1508: VecCopy(yy,zz);
1509: VecGetArrayRead(xx,&x);
1510: VecGetArray(zz,&zarray);
1512: idx = a->j;
1513: v = a->a;
1514: if (usecprow) {
1515: mbs = a->compressedrow.nrows;
1516: ii = a->compressedrow.i;
1517: ridx = a->compressedrow.rindex;
1518: } else {
1519: mbs = a->mbs;
1520: ii = a->i;
1521: z = zarray;
1522: }
1524: if (!a->mult_work) {
1525: k = PetscMax(A->rmap->n,A->cmap->n);
1526: PetscMalloc1(k+1,&a->mult_work);
1527: }
1528: work = a->mult_work;
1529: for (i=0; i<mbs; i++) {
1530: n = ii[1] - ii[0]; ii++;
1531: ncols = n*bs;
1532: workt = work;
1533: for (j=0; j<n; j++) {
1534: xb = x + bs*(*idx++);
1535: for (k=0; k<bs; k++) workt[k] = xb[k];
1536: workt += bs;
1537: }
1538: if (usecprow) z = zarray + bs*ridx[i];
1539: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1540: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1541: v += n*bs2;
1542: if (!usecprow) z += bs;
1543: }
1544: VecRestoreArrayRead(xx,&x);
1545: VecRestoreArray(zz,&zarray);
1546: PetscLogFlops(2.0*a->nz*bs2);
1547: return(0);
1548: }
1552: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1553: {
1554: PetscScalar zero = 0.0;
1558: VecSet(zz,zero);
1559: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1560: return(0);
1561: }
1565: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1566: {
1567: PetscScalar zero = 0.0;
1571: VecSet(zz,zero);
1572: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1573: return(0);
1574: }
1578: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1579: {
1580: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1581: PetscScalar *z,x1,x2,x3,x4,x5;
1582: const PetscScalar *x,*xb = NULL;
1583: const MatScalar *v;
1584: PetscErrorCode ierr;
1585: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1586: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1587: Mat_CompressedRow cprow = a->compressedrow;
1588: PetscBool usecprow = cprow.use;
1591: if (yy != zz) { VecCopy(yy,zz); }
1592: VecGetArrayRead(xx,&x);
1593: VecGetArray(zz,&z);
1595: idx = a->j;
1596: v = a->a;
1597: if (usecprow) {
1598: mbs = cprow.nrows;
1599: ii = cprow.i;
1600: ridx = cprow.rindex;
1601: } else {
1602: mbs=a->mbs;
1603: ii = a->i;
1604: xb = x;
1605: }
1607: switch (bs) {
1608: case 1:
1609: for (i=0; i<mbs; i++) {
1610: if (usecprow) xb = x + ridx[i];
1611: x1 = xb[0];
1612: ib = idx + ii[0];
1613: n = ii[1] - ii[0]; ii++;
1614: for (j=0; j<n; j++) {
1615: rval = ib[j];
1616: z[rval] += PetscConj(*v) * x1;
1617: v++;
1618: }
1619: if (!usecprow) xb++;
1620: }
1621: break;
1622: case 2:
1623: for (i=0; i<mbs; i++) {
1624: if (usecprow) xb = x + 2*ridx[i];
1625: x1 = xb[0]; x2 = xb[1];
1626: ib = idx + ii[0];
1627: n = ii[1] - ii[0]; ii++;
1628: for (j=0; j<n; j++) {
1629: rval = ib[j]*2;
1630: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1631: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1632: v += 4;
1633: }
1634: if (!usecprow) xb += 2;
1635: }
1636: break;
1637: case 3:
1638: for (i=0; i<mbs; i++) {
1639: if (usecprow) xb = x + 3*ridx[i];
1640: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1641: ib = idx + ii[0];
1642: n = ii[1] - ii[0]; ii++;
1643: for (j=0; j<n; j++) {
1644: rval = ib[j]*3;
1645: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1646: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1647: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1648: v += 9;
1649: }
1650: if (!usecprow) xb += 3;
1651: }
1652: break;
1653: case 4:
1654: for (i=0; i<mbs; i++) {
1655: if (usecprow) xb = x + 4*ridx[i];
1656: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1657: ib = idx + ii[0];
1658: n = ii[1] - ii[0]; ii++;
1659: for (j=0; j<n; j++) {
1660: rval = ib[j]*4;
1661: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1662: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1663: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1664: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1665: v += 16;
1666: }
1667: if (!usecprow) xb += 4;
1668: }
1669: break;
1670: case 5:
1671: for (i=0; i<mbs; i++) {
1672: if (usecprow) xb = x + 5*ridx[i];
1673: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1674: x4 = xb[3]; x5 = xb[4];
1675: ib = idx + ii[0];
1676: n = ii[1] - ii[0]; ii++;
1677: for (j=0; j<n; j++) {
1678: rval = ib[j]*5;
1679: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1680: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1681: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1682: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1683: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1684: v += 25;
1685: }
1686: if (!usecprow) xb += 5;
1687: }
1688: break;
1689: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1690: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1691: #if 0
1692: {
1693: PetscInt ncols,k,bs2=a->bs2;
1694: PetscScalar *work,*workt,zb;
1695: const PetscScalar *xtmp;
1696: if (!a->mult_work) {
1697: k = PetscMax(A->rmap->n,A->cmap->n);
1698: PetscMalloc1(k+1,&a->mult_work);
1699: }
1700: work = a->mult_work;
1701: xtmp = x;
1702: for (i=0; i<mbs; i++) {
1703: n = ii[1] - ii[0]; ii++;
1704: ncols = n*bs;
1705: PetscMemzero(work,ncols*sizeof(PetscScalar));
1706: if (usecprow) xtmp = x + bs*ridx[i];
1707: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1708: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1709: v += n*bs2;
1710: if (!usecprow) xtmp += bs;
1711: workt = work;
1712: for (j=0; j<n; j++) {
1713: zb = z + bs*(*idx++);
1714: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1715: workt += bs;
1716: }
1717: }
1718: }
1719: #endif
1720: }
1721: VecRestoreArrayRead(xx,&x);
1722: VecRestoreArray(zz,&z);
1723: PetscLogFlops(2.0*a->nz*a->bs2);
1724: return(0);
1725: }
1729: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1730: {
1731: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1732: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
1733: const PetscScalar *x,*xb = 0;
1734: const MatScalar *v;
1735: PetscErrorCode ierr;
1736: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1737: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1738: Mat_CompressedRow cprow = a->compressedrow;
1739: PetscBool usecprow=cprow.use;
1742: if (yy != zz) { VecCopy(yy,zz); }
1743: VecGetArrayRead(xx,&x);
1744: VecGetArray(zz,&z);
1746: idx = a->j;
1747: v = a->a;
1748: if (usecprow) {
1749: mbs = cprow.nrows;
1750: ii = cprow.i;
1751: ridx = cprow.rindex;
1752: } else {
1753: mbs=a->mbs;
1754: ii = a->i;
1755: xb = x;
1756: }
1758: switch (bs) {
1759: case 1:
1760: for (i=0; i<mbs; i++) {
1761: if (usecprow) xb = x + ridx[i];
1762: x1 = xb[0];
1763: ib = idx + ii[0];
1764: n = ii[1] - ii[0]; ii++;
1765: for (j=0; j<n; j++) {
1766: rval = ib[j];
1767: z[rval] += *v * x1;
1768: v++;
1769: }
1770: if (!usecprow) xb++;
1771: }
1772: break;
1773: case 2:
1774: for (i=0; i<mbs; i++) {
1775: if (usecprow) xb = x + 2*ridx[i];
1776: x1 = xb[0]; x2 = xb[1];
1777: ib = idx + ii[0];
1778: n = ii[1] - ii[0]; ii++;
1779: for (j=0; j<n; j++) {
1780: rval = ib[j]*2;
1781: z[rval++] += v[0]*x1 + v[1]*x2;
1782: z[rval++] += v[2]*x1 + v[3]*x2;
1783: v += 4;
1784: }
1785: if (!usecprow) xb += 2;
1786: }
1787: break;
1788: case 3:
1789: for (i=0; i<mbs; i++) {
1790: if (usecprow) xb = x + 3*ridx[i];
1791: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1792: ib = idx + ii[0];
1793: n = ii[1] - ii[0]; ii++;
1794: for (j=0; j<n; j++) {
1795: rval = ib[j]*3;
1796: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1797: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1798: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1799: v += 9;
1800: }
1801: if (!usecprow) xb += 3;
1802: }
1803: break;
1804: case 4:
1805: for (i=0; i<mbs; i++) {
1806: if (usecprow) xb = x + 4*ridx[i];
1807: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1808: ib = idx + ii[0];
1809: n = ii[1] - ii[0]; ii++;
1810: for (j=0; j<n; j++) {
1811: rval = ib[j]*4;
1812: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1813: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1814: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1815: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1816: v += 16;
1817: }
1818: if (!usecprow) xb += 4;
1819: }
1820: break;
1821: case 5:
1822: for (i=0; i<mbs; i++) {
1823: if (usecprow) xb = x + 5*ridx[i];
1824: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1825: x4 = xb[3]; x5 = xb[4];
1826: ib = idx + ii[0];
1827: n = ii[1] - ii[0]; ii++;
1828: for (j=0; j<n; j++) {
1829: rval = ib[j]*5;
1830: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1831: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1832: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1833: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1834: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1835: v += 25;
1836: }
1837: if (!usecprow) xb += 5;
1838: }
1839: break;
1840: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1841: PetscInt ncols,k;
1842: PetscScalar *work,*workt;
1843: const PetscScalar *xtmp;
1844: if (!a->mult_work) {
1845: k = PetscMax(A->rmap->n,A->cmap->n);
1846: PetscMalloc1(k+1,&a->mult_work);
1847: }
1848: work = a->mult_work;
1849: xtmp = x;
1850: for (i=0; i<mbs; i++) {
1851: n = ii[1] - ii[0]; ii++;
1852: ncols = n*bs;
1853: PetscMemzero(work,ncols*sizeof(PetscScalar));
1854: if (usecprow) xtmp = x + bs*ridx[i];
1855: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1856: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1857: v += n*bs2;
1858: if (!usecprow) xtmp += bs;
1859: workt = work;
1860: for (j=0; j<n; j++) {
1861: zb = z + bs*(*idx++);
1862: for (k=0; k<bs; k++) zb[k] += workt[k];
1863: workt += bs;
1864: }
1865: }
1866: }
1867: }
1868: VecRestoreArrayRead(xx,&x);
1869: VecRestoreArray(zz,&z);
1870: PetscLogFlops(2.0*a->nz*a->bs2);
1871: return(0);
1872: }
1876: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1877: {
1878: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1879: PetscInt totalnz = a->bs2*a->nz;
1880: PetscScalar oalpha = alpha;
1882: PetscBLASInt one = 1,tnz;
1885: PetscBLASIntCast(totalnz,&tnz);
1886: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1887: PetscLogFlops(totalnz);
1888: return(0);
1889: }
1893: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1894: {
1896: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1897: MatScalar *v = a->a;
1898: PetscReal sum = 0.0;
1899: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1902: if (type == NORM_FROBENIUS) {
1903: for (i=0; i< bs2*nz; i++) {
1904: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1905: }
1906: *norm = PetscSqrtReal(sum);
1907: PetscLogFlops(2*bs2*nz);
1908: } else if (type == NORM_1) { /* maximum column sum */
1909: PetscReal *tmp;
1910: PetscInt *bcol = a->j;
1911: PetscCalloc1(A->cmap->n+1,&tmp);
1912: for (i=0; i<nz; i++) {
1913: for (j=0; j<bs; j++) {
1914: k1 = bs*(*bcol) + j; /* column index */
1915: for (k=0; k<bs; k++) {
1916: tmp[k1] += PetscAbsScalar(*v); v++;
1917: }
1918: }
1919: bcol++;
1920: }
1921: *norm = 0.0;
1922: for (j=0; j<A->cmap->n; j++) {
1923: if (tmp[j] > *norm) *norm = tmp[j];
1924: }
1925: PetscFree(tmp);
1926: PetscLogFlops(PetscMax(bs2*nz-1,0));
1927: } else if (type == NORM_INFINITY) { /* maximum row sum */
1928: *norm = 0.0;
1929: for (k=0; k<bs; k++) {
1930: for (j=0; j<a->mbs; j++) {
1931: v = a->a + bs2*a->i[j] + k;
1932: sum = 0.0;
1933: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1934: for (k1=0; k1<bs; k1++) {
1935: sum += PetscAbsScalar(*v);
1936: v += bs;
1937: }
1938: }
1939: if (sum > *norm) *norm = sum;
1940: }
1941: }
1942: PetscLogFlops(PetscMax(bs2*nz-1,0));
1943: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1944: return(0);
1945: }
1950: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1951: {
1952: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1956: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1957: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1958: *flg = PETSC_FALSE;
1959: return(0);
1960: }
1962: /* if the a->i are the same */
1963: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1964: if (!*flg) return(0);
1966: /* if a->j are the same */
1967: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1968: if (!*flg) return(0);
1970: /* if a->a are the same */
1971: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1972: return(0);
1974: }
1978: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1979: {
1980: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1982: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1983: PetscScalar *x,zero = 0.0;
1984: MatScalar *aa,*aa_j;
1987: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1988: bs = A->rmap->bs;
1989: aa = a->a;
1990: ai = a->i;
1991: aj = a->j;
1992: ambs = a->mbs;
1993: bs2 = a->bs2;
1995: VecSet(v,zero);
1996: VecGetArray(v,&x);
1997: VecGetLocalSize(v,&n);
1998: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1999: for (i=0; i<ambs; i++) {
2000: for (j=ai[i]; j<ai[i+1]; j++) {
2001: if (aj[j] == i) {
2002: row = i*bs;
2003: aa_j = aa+j*bs2;
2004: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2005: break;
2006: }
2007: }
2008: }
2009: VecRestoreArray(v,&x);
2010: return(0);
2011: }
2015: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2016: {
2017: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2018: const PetscScalar *l,*r,*li,*ri;
2019: PetscScalar x;
2020: MatScalar *aa, *v;
2021: PetscErrorCode ierr;
2022: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2023: const PetscInt *ai,*aj;
2026: ai = a->i;
2027: aj = a->j;
2028: aa = a->a;
2029: m = A->rmap->n;
2030: n = A->cmap->n;
2031: bs = A->rmap->bs;
2032: mbs = a->mbs;
2033: bs2 = a->bs2;
2034: if (ll) {
2035: VecGetArrayRead(ll,&l);
2036: VecGetLocalSize(ll,&lm);
2037: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2038: for (i=0; i<mbs; i++) { /* for each block row */
2039: M = ai[i+1] - ai[i];
2040: li = l + i*bs;
2041: v = aa + bs2*ai[i];
2042: for (j=0; j<M; j++) { /* for each block */
2043: for (k=0; k<bs2; k++) {
2044: (*v++) *= li[k%bs];
2045: }
2046: }
2047: }
2048: VecRestoreArrayRead(ll,&l);
2049: PetscLogFlops(a->nz);
2050: }
2052: if (rr) {
2053: VecGetArrayRead(rr,&r);
2054: VecGetLocalSize(rr,&rn);
2055: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2056: for (i=0; i<mbs; i++) { /* for each block row */
2057: iai = ai[i];
2058: M = ai[i+1] - iai;
2059: v = aa + bs2*iai;
2060: for (j=0; j<M; j++) { /* for each block */
2061: ri = r + bs*aj[iai+j];
2062: for (k=0; k<bs; k++) {
2063: x = ri[k];
2064: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2065: v += bs;
2066: }
2067: }
2068: }
2069: VecRestoreArrayRead(rr,&r);
2070: PetscLogFlops(a->nz);
2071: }
2072: return(0);
2073: }
2078: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2079: {
2080: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2083: info->block_size = a->bs2;
2084: info->nz_allocated = a->bs2*a->maxnz;
2085: info->nz_used = a->bs2*a->nz;
2086: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2087: info->assemblies = A->num_ass;
2088: info->mallocs = A->info.mallocs;
2089: info->memory = ((PetscObject)A)->mem;
2090: if (A->factortype) {
2091: info->fill_ratio_given = A->info.fill_ratio_given;
2092: info->fill_ratio_needed = A->info.fill_ratio_needed;
2093: info->factor_mallocs = A->info.factor_mallocs;
2094: } else {
2095: info->fill_ratio_given = 0;
2096: info->fill_ratio_needed = 0;
2097: info->factor_mallocs = 0;
2098: }
2099: return(0);
2100: }
2104: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2105: {
2106: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2110: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2111: return(0);
2112: }