Actual source code: baij2.c
petsc-3.12.5 2020-03-29
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9: #include <immintrin.h>
10: #endif
12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17: const PetscInt *idx;
18: PetscInt start,end,*ai,*aj,bs,*nidx2;
19: PetscBT table;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: PetscBTCreate(m,&table);
30: PetscMalloc1(m+1,&nidx);
31: PetscMalloc1(A->rmap->N+1,&nidx2);
33: for (i=0; i<is_max; i++) {
34: /* Initialise the two local arrays */
35: isz = 0;
36: PetscBTMemzero(m,table);
38: /* Extract the indices, assume there can be duplicate entries */
39: ISGetIndices(is[i],&idx);
40: ISGetLocalSize(is[i],&n);
42: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43: for (j=0; j<n; ++j) {
44: ival = idx[j]/bs; /* convert the indices into block indices */
45: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(&is[i]);
51: k = 0;
52: for (j=0; j<ov; j++) { /* for each overlap*/
53: n = isz;
54: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55: row = nidx[k];
56: start = ai[row];
57: end = ai[row+1];
58: for (l = start; l<end; l++) {
59: val = aj[l];
60: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61: }
62: }
63: }
64: /* expand the Index Set */
65: for (j=0; j<isz; j++) {
66: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67: }
68: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
69: }
70: PetscBTDestroy(&table);
71: PetscFree(nidx);
72: PetscFree(nidx2);
73: return(0);
74: }
76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: const PetscInt *irow,*icol;
83: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84: PetscInt *aj = a->j,*ai = a->i;
85: MatScalar *mat_a;
86: Mat C;
87: PetscBool flag;
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscCalloc1(1+oldcols,&smap);
96: ssmap = smap;
97: PetscMalloc1(1+nrows,&lens);
98: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99: /* determine lens of each row */
100: for (i=0; i<nrows; i++) {
101: kstart = ai[irow[i]];
102: kend = kstart + a->ilen[irow[i]];
103: lens[i] = 0;
104: for (k=kstart; k<kend; k++) {
105: if (ssmap[aj[k]]) lens[i]++;
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ*)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115: PetscArrayzero(c->ilen,c->mbs);
116: C = *B;
117: } else {
118: MatCreate(PetscObjectComm((PetscObject)A),&C);
119: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120: MatSetType(C,((PetscObject)A)->type_name);
121: MatSeqBAIJSetPreallocation(C,bs,0,lens);
122: }
123: c = (Mat_SeqBAIJ*)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141: /* sort */
142: {
143: MatScalar *work;
144: PetscMalloc1(bs2,&work);
145: for (i=0; i<nrows; i++) {
146: PetscInt ilen;
147: mat_i = c->i[i];
148: mat_j = c->j + mat_i;
149: mat_a = c->a + mat_i*bs2;
150: ilen = c->ilen[i];
151: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152: }
153: PetscFree(work);
154: }
156: /* Free work space */
157: ISRestoreIndices(iscol,&icol);
158: PetscFree(smap);
159: PetscFree(lens);
160: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
163: ISRestoreIndices(isrow,&irow);
164: *B = C;
165: return(0);
166: }
168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171: IS is1,is2;
173: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174: const PetscInt *irow,*icol;
177: ISGetIndices(isrow,&irow);
178: ISGetIndices(iscol,&icol);
179: ISGetLocalSize(isrow,&nrows);
180: ISGetLocalSize(iscol,&ncols);
182: /* Verify if the indices corespond to each element in a block
183: and form the IS with compressed IS */
184: maxmnbs = PetscMax(a->mbs,a->nbs);
185: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186: PetscArrayzero(vary,a->mbs);
187: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190: }
191: count = 0;
192: for (i=0; i<nrows; i++) {
193: j = irow[i] / bs;
194: if ((vary[j]--)==bs) iary[count++] = j;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
198: PetscArrayzero(vary,a->nbs);
199: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200: for (i=0; i<a->nbs; i++) {
201: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202: }
203: count = 0;
204: for (i=0; i<ncols; i++) {
205: j = icol[i] / bs;
206: if ((vary[j]--)==bs) iary[count++] = j;
207: }
208: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209: ISRestoreIndices(isrow,&irow);
210: ISRestoreIndices(iscol,&icol);
211: PetscFree2(vary,iary);
213: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214: ISDestroy(&is1);
215: ISDestroy(&is2);
216: return(0);
217: }
219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
223: Mat_SubSppt *submatj = c->submatis1;
226: (*submatj->destroy)(C);
227: MatDestroySubMatrix_Private(submatj);
228: return(0);
229: }
231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234: PetscInt i;
235: Mat C;
236: Mat_SeqBAIJ *c;
237: Mat_SubSppt *submatj;
240: for (i=0; i<n; i++) {
241: C = (*mat)[i];
242: c = (Mat_SeqBAIJ*)C->data;
243: submatj = c->submatis1;
244: if (submatj) {
245: (*submatj->destroy)(C);
246: MatDestroySubMatrix_Private(submatj);
247: PetscFree(C->defaultvectype);
248: PetscLayoutDestroy(&C->rmap);
249: PetscLayoutDestroy(&C->cmap);
250: PetscHeaderDestroy(&C);
251: } else {
252: MatDestroy(&C);
253: }
254: }
255: PetscFree(*mat);
256: return(0);
257: }
259: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
262: PetscInt i;
265: if (scall == MAT_INITIAL_MATRIX) {
266: PetscCalloc1(n+1,B);
267: }
269: for (i=0; i<n; i++) {
270: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
271: }
272: return(0);
273: }
275: /* -------------------------------------------------------*/
276: /* Should check that shapes of vectors and matrices match */
277: /* -------------------------------------------------------*/
279: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280: {
281: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
282: PetscScalar *z,sum;
283: const PetscScalar *x;
284: const MatScalar *v;
285: PetscErrorCode ierr;
286: PetscInt mbs,i,n;
287: const PetscInt *idx,*ii,*ridx=NULL;
288: PetscBool usecprow=a->compressedrow.use;
291: VecGetArrayRead(xx,&x);
292: VecGetArray(zz,&z);
294: if (usecprow) {
295: mbs = a->compressedrow.nrows;
296: ii = a->compressedrow.i;
297: ridx = a->compressedrow.rindex;
298: PetscArrayzero(z,a->mbs);
299: } else {
300: mbs = a->mbs;
301: ii = a->i;
302: }
304: for (i=0; i<mbs; i++) {
305: n = ii[1] - ii[0];
306: v = a->a + ii[0];
307: idx = a->j + ii[0];
308: ii++;
309: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
310: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311: sum = 0.0;
312: PetscSparseDensePlusDot(sum,x,v,idx,n);
313: if (usecprow) {
314: z[ridx[i]] = sum;
315: } else {
316: z[i] = sum;
317: }
318: }
319: VecRestoreArrayRead(xx,&x);
320: VecRestoreArray(zz,&z);
321: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
322: return(0);
323: }
325: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326: {
327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
328: PetscScalar *z = 0,sum1,sum2,*zarray;
329: const PetscScalar *x,*xb;
330: PetscScalar x1,x2;
331: const MatScalar *v;
332: PetscErrorCode ierr;
333: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
334: PetscBool usecprow=a->compressedrow.use;
337: VecGetArrayRead(xx,&x);
338: VecGetArray(zz,&zarray);
340: idx = a->j;
341: v = a->a;
342: if (usecprow) {
343: mbs = a->compressedrow.nrows;
344: ii = a->compressedrow.i;
345: ridx = a->compressedrow.rindex;
346: PetscArrayzero(zarray,2*a->mbs);
347: } else {
348: mbs = a->mbs;
349: ii = a->i;
350: z = zarray;
351: }
353: for (i=0; i<mbs; i++) {
354: n = ii[1] - ii[0]; ii++;
355: sum1 = 0.0; sum2 = 0.0;
356: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358: for (j=0; j<n; j++) {
359: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360: sum1 += v[0]*x1 + v[2]*x2;
361: sum2 += v[1]*x1 + v[3]*x2;
362: v += 4;
363: }
364: if (usecprow) z = zarray + 2*ridx[i];
365: z[0] = sum1; z[1] = sum2;
366: if (!usecprow) z += 2;
367: }
368: VecRestoreArrayRead(xx,&x);
369: VecRestoreArray(zz,&zarray);
370: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
371: return(0);
372: }
374: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375: {
376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
378: const PetscScalar *x,*xb;
379: const MatScalar *v;
380: PetscErrorCode ierr;
381: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
382: PetscBool usecprow=a->compressedrow.use;
384: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385: #pragma disjoint(*v,*z,*xb)
386: #endif
389: VecGetArrayRead(xx,&x);
390: VecGetArray(zz,&zarray);
392: idx = a->j;
393: v = a->a;
394: if (usecprow) {
395: mbs = a->compressedrow.nrows;
396: ii = a->compressedrow.i;
397: ridx = a->compressedrow.rindex;
398: PetscArrayzero(zarray,3*a->mbs);
399: } else {
400: mbs = a->mbs;
401: ii = a->i;
402: z = zarray;
403: }
405: for (i=0; i<mbs; i++) {
406: n = ii[1] - ii[0]; ii++;
407: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
408: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
409: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410: for (j=0; j<n; j++) {
411: xb = x + 3*(*idx++);
412: x1 = xb[0];
413: x2 = xb[1];
414: x3 = xb[2];
416: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419: v += 9;
420: }
421: if (usecprow) z = zarray + 3*ridx[i];
422: z[0] = sum1; z[1] = sum2; z[2] = sum3;
423: if (!usecprow) z += 3;
424: }
425: VecRestoreArrayRead(xx,&x);
426: VecRestoreArray(zz,&zarray);
427: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
428: return(0);
429: }
431: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432: {
433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
434: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435: const PetscScalar *x,*xb;
436: const MatScalar *v;
437: PetscErrorCode ierr;
438: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
439: PetscBool usecprow=a->compressedrow.use;
442: VecGetArrayRead(xx,&x);
443: VecGetArray(zz,&zarray);
445: idx = a->j;
446: v = a->a;
447: if (usecprow) {
448: mbs = a->compressedrow.nrows;
449: ii = a->compressedrow.i;
450: ridx = a->compressedrow.rindex;
451: PetscArrayzero(zarray,4*a->mbs);
452: } else {
453: mbs = a->mbs;
454: ii = a->i;
455: z = zarray;
456: }
458: for (i=0; i<mbs; i++) {
459: n = ii[1] - ii[0];
460: ii++;
461: sum1 = 0.0;
462: sum2 = 0.0;
463: sum3 = 0.0;
464: sum4 = 0.0;
466: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
467: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468: for (j=0; j<n; j++) {
469: xb = x + 4*(*idx++);
470: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
472: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
473: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475: v += 16;
476: }
477: if (usecprow) z = zarray + 4*ridx[i];
478: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479: if (!usecprow) z += 4;
480: }
481: VecRestoreArrayRead(xx,&x);
482: VecRestoreArray(zz,&zarray);
483: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
484: return(0);
485: }
487: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488: {
489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
490: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
491: const PetscScalar *xb,*x;
492: const MatScalar *v;
493: PetscErrorCode ierr;
494: const PetscInt *idx,*ii,*ridx=NULL;
495: PetscInt mbs,i,j,n;
496: PetscBool usecprow=a->compressedrow.use;
499: VecGetArrayRead(xx,&x);
500: VecGetArray(zz,&zarray);
502: idx = a->j;
503: v = a->a;
504: if (usecprow) {
505: mbs = a->compressedrow.nrows;
506: ii = a->compressedrow.i;
507: ridx = a->compressedrow.rindex;
508: PetscArrayzero(zarray,5*a->mbs);
509: } else {
510: mbs = a->mbs;
511: ii = a->i;
512: z = zarray;
513: }
515: for (i=0; i<mbs; i++) {
516: n = ii[1] - ii[0]; ii++;
517: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520: for (j=0; j<n; j++) {
521: xb = x + 5*(*idx++);
522: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528: v += 25;
529: }
530: if (usecprow) z = zarray + 5*ridx[i];
531: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532: if (!usecprow) z += 5;
533: }
534: VecRestoreArrayRead(xx,&x);
535: VecRestoreArray(zz,&zarray);
536: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
537: return(0);
538: }
542: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
543: {
544: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
545: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
546: const PetscScalar *x,*xb;
547: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
548: const MatScalar *v;
549: PetscErrorCode ierr;
550: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
551: PetscBool usecprow=a->compressedrow.use;
554: VecGetArrayRead(xx,&x);
555: VecGetArray(zz,&zarray);
557: idx = a->j;
558: v = a->a;
559: if (usecprow) {
560: mbs = a->compressedrow.nrows;
561: ii = a->compressedrow.i;
562: ridx = a->compressedrow.rindex;
563: PetscArrayzero(zarray,6*a->mbs);
564: } else {
565: mbs = a->mbs;
566: ii = a->i;
567: z = zarray;
568: }
570: for (i=0; i<mbs; i++) {
571: n = ii[1] - ii[0];
572: ii++;
573: sum1 = 0.0;
574: sum2 = 0.0;
575: sum3 = 0.0;
576: sum4 = 0.0;
577: sum5 = 0.0;
578: sum6 = 0.0;
580: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
581: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582: for (j=0; j<n; j++) {
583: xb = x + 6*(*idx++);
584: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
585: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
586: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
587: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
588: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
589: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
590: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
591: v += 36;
592: }
593: if (usecprow) z = zarray + 6*ridx[i];
594: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
595: if (!usecprow) z += 6;
596: }
598: VecRestoreArrayRead(xx,&x);
599: VecRestoreArray(zz,&zarray);
600: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
601: return(0);
602: }
604: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
605: {
606: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
607: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
608: const PetscScalar *x,*xb;
609: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
610: const MatScalar *v;
611: PetscErrorCode ierr;
612: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
613: PetscBool usecprow=a->compressedrow.use;
616: VecGetArrayRead(xx,&x);
617: VecGetArray(zz,&zarray);
619: idx = a->j;
620: v = a->a;
621: if (usecprow) {
622: mbs = a->compressedrow.nrows;
623: ii = a->compressedrow.i;
624: ridx = a->compressedrow.rindex;
625: PetscArrayzero(zarray,7*a->mbs);
626: } else {
627: mbs = a->mbs;
628: ii = a->i;
629: z = zarray;
630: }
632: for (i=0; i<mbs; i++) {
633: n = ii[1] - ii[0];
634: ii++;
635: sum1 = 0.0;
636: sum2 = 0.0;
637: sum3 = 0.0;
638: sum4 = 0.0;
639: sum5 = 0.0;
640: sum6 = 0.0;
641: sum7 = 0.0;
643: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
644: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
645: for (j=0; j<n; j++) {
646: xb = x + 7*(*idx++);
647: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
648: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
649: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
650: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
651: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
652: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
653: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
654: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
655: v += 49;
656: }
657: if (usecprow) z = zarray + 7*ridx[i];
658: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
659: if (!usecprow) z += 7;
660: }
662: VecRestoreArrayRead(xx,&x);
663: VecRestoreArray(zz,&zarray);
664: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
665: return(0);
666: }
668: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
669: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
670: {
671: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
672: PetscScalar *z = 0,*work,*workt,*zarray;
673: const PetscScalar *x,*xb;
674: const MatScalar *v;
676: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
677: const PetscInt *idx,*ii,*ridx=NULL;
678: PetscInt k;
679: PetscBool usecprow=a->compressedrow.use;
681: __m256d a0,a1,a2,a3,a4,a5;
682: __m256d w0,w1,w2,w3;
683: __m256d z0,z1,z2;
684: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
687: VecGetArrayRead(xx,&x);
688: VecGetArray(zz,&zarray);
690: idx = a->j;
691: v = a->a;
692: if (usecprow) {
693: mbs = a->compressedrow.nrows;
694: ii = a->compressedrow.i;
695: ridx = a->compressedrow.rindex;
696: PetscArrayzero(zarray,bs*a->mbs);
697: } else {
698: mbs = a->mbs;
699: ii = a->i;
700: z = zarray;
701: }
703: if (!a->mult_work) {
704: k = PetscMax(A->rmap->n,A->cmap->n);
705: PetscMalloc1(k+1,&a->mult_work);
706: }
708: work = a->mult_work;
709: for (i=0; i<mbs; i++) {
710: n = ii[1] - ii[0]; ii++;
711: workt = work;
712: for (j=0; j<n; j++) {
713: xb = x + bs*(*idx++);
714: for (k=0; k<bs; k++) workt[k] = xb[k];
715: workt += bs;
716: }
717: if (usecprow) z = zarray + bs*ridx[i];
719: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
721: for (j=0; j<n; j++) {
722: /* first column of a */
723: w0 = _mm256_set1_pd(work[j*9 ]);
724: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
725: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
726: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
728: /* second column of a */
729: w1 = _mm256_set1_pd(work[j*9+ 1]);
730: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
731: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
732: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
734: /* third column of a */
735: w2 = _mm256_set1_pd(work[j*9 +2]);
736: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
737: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
738: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
740: /* fourth column of a */
741: w3 = _mm256_set1_pd(work[j*9+ 3]);
742: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
743: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
744: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
746: /* fifth column of a */
747: w0 = _mm256_set1_pd(work[j*9+ 4]);
748: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
749: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
750: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
752: /* sixth column of a */
753: w1 = _mm256_set1_pd(work[j*9+ 5]);
754: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
755: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
756: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
758: /* seventh column of a */
759: w2 = _mm256_set1_pd(work[j*9+ 6]);
760: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
761: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
762: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
764: /* eigth column of a */
765: w3 = _mm256_set1_pd(work[j*9+ 7]);
766: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
767: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
768: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
770: /* ninth column of a */
771: w0 = _mm256_set1_pd(work[j*9+ 8]);
772: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
773: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
774: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
775: }
777: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
779: v += n*bs2;
780: if (!usecprow) z += bs;
781: }
782: VecRestoreArrayRead(xx,&x);
783: VecRestoreArray(zz,&zarray);
784: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
785: return(0);
786: }
787: #endif
789: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
790: {
791: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
792: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
793: const PetscScalar *x,*xb;
794: PetscScalar *zarray,xv;
795: const MatScalar *v;
796: PetscErrorCode ierr;
797: const PetscInt *ii,*ij=a->j,*idx;
798: PetscInt mbs,i,j,k,n,*ridx=NULL;
799: PetscBool usecprow=a->compressedrow.use;
802: VecGetArrayRead(xx,&x);
803: VecGetArray(zz,&zarray);
805: v = a->a;
806: if (usecprow) {
807: mbs = a->compressedrow.nrows;
808: ii = a->compressedrow.i;
809: ridx = a->compressedrow.rindex;
810: PetscArrayzero(zarray,11*a->mbs);
811: } else {
812: mbs = a->mbs;
813: ii = a->i;
814: z = zarray;
815: }
817: for (i=0; i<mbs; i++) {
818: n = ii[i+1] - ii[i];
819: idx = ij + ii[i];
820: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
821: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
823: for (j=0; j<n; j++) {
824: xb = x + 11*(idx[j]);
826: for (k=0; k<11; k++) {
827: xv = xb[k];
828: sum1 += v[0]*xv;
829: sum2 += v[1]*xv;
830: sum3 += v[2]*xv;
831: sum4 += v[3]*xv;
832: sum5 += v[4]*xv;
833: sum6 += v[5]*xv;
834: sum7 += v[6]*xv;
835: sum8 += v[7]*xv;
836: sum9 += v[8]*xv;
837: sum10 += v[9]*xv;
838: sum11 += v[10]*xv;
839: v += 11;
840: }
841: }
842: if (usecprow) z = zarray + 11*ridx[i];
843: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
844: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
846: if (!usecprow) z += 11;
847: }
849: VecRestoreArrayRead(xx,&x);
850: VecRestoreArray(zz,&zarray);
851: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
852: return(0);
853: }
855: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
856: /* Default MatMult for block size 15 */
858: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
859: {
860: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
861: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
862: const PetscScalar *x,*xb;
863: PetscScalar *zarray,xv;
864: const MatScalar *v;
865: PetscErrorCode ierr;
866: const PetscInt *ii,*ij=a->j,*idx;
867: PetscInt mbs,i,j,k,n,*ridx=NULL;
868: PetscBool usecprow=a->compressedrow.use;
871: VecGetArrayRead(xx,&x);
872: VecGetArray(zz,&zarray);
874: v = a->a;
875: if (usecprow) {
876: mbs = a->compressedrow.nrows;
877: ii = a->compressedrow.i;
878: ridx = a->compressedrow.rindex;
879: PetscArrayzero(zarray,15*a->mbs);
880: } else {
881: mbs = a->mbs;
882: ii = a->i;
883: z = zarray;
884: }
886: for (i=0; i<mbs; i++) {
887: n = ii[i+1] - ii[i];
888: idx = ij + ii[i];
889: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
890: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
892: for (j=0; j<n; j++) {
893: xb = x + 15*(idx[j]);
895: for (k=0; k<15; k++) {
896: xv = xb[k];
897: sum1 += v[0]*xv;
898: sum2 += v[1]*xv;
899: sum3 += v[2]*xv;
900: sum4 += v[3]*xv;
901: sum5 += v[4]*xv;
902: sum6 += v[5]*xv;
903: sum7 += v[6]*xv;
904: sum8 += v[7]*xv;
905: sum9 += v[8]*xv;
906: sum10 += v[9]*xv;
907: sum11 += v[10]*xv;
908: sum12 += v[11]*xv;
909: sum13 += v[12]*xv;
910: sum14 += v[13]*xv;
911: sum15 += v[14]*xv;
912: v += 15;
913: }
914: }
915: if (usecprow) z = zarray + 15*ridx[i];
916: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
917: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
919: if (!usecprow) z += 15;
920: }
922: VecRestoreArrayRead(xx,&x);
923: VecRestoreArray(zz,&zarray);
924: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
925: return(0);
926: }
928: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
929: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
930: {
931: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
932: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
933: const PetscScalar *x,*xb;
934: PetscScalar x1,x2,x3,x4,*zarray;
935: const MatScalar *v;
936: PetscErrorCode ierr;
937: const PetscInt *ii,*ij=a->j,*idx;
938: PetscInt mbs,i,j,n,*ridx=NULL;
939: PetscBool usecprow=a->compressedrow.use;
942: VecGetArrayRead(xx,&x);
943: VecGetArray(zz,&zarray);
945: v = a->a;
946: if (usecprow) {
947: mbs = a->compressedrow.nrows;
948: ii = a->compressedrow.i;
949: ridx = a->compressedrow.rindex;
950: PetscArrayzero(zarray,15*a->mbs);
951: } else {
952: mbs = a->mbs;
953: ii = a->i;
954: z = zarray;
955: }
957: for (i=0; i<mbs; i++) {
958: n = ii[i+1] - ii[i];
959: idx = ij + ii[i];
960: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
961: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
963: for (j=0; j<n; j++) {
964: xb = x + 15*(idx[j]);
965: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
967: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
968: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
969: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
970: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
971: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
972: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
973: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
974: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
975: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
976: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
977: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
978: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
979: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
980: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
981: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
983: v += 60;
985: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
987: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
988: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
989: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
990: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
991: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
992: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
993: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
994: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
995: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
996: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
997: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
998: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
999: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1000: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1001: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1002: v += 60;
1004: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1005: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1006: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1007: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1008: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1009: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1010: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1011: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1012: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1013: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1014: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1015: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1016: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1017: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1018: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1019: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1020: v += 60;
1022: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1023: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1024: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1025: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1026: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1027: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1028: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1029: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1030: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1031: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1032: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1033: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1034: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1035: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1036: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1037: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1038: v += 45;
1039: }
1040: if (usecprow) z = zarray + 15*ridx[i];
1041: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1042: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1044: if (!usecprow) z += 15;
1045: }
1047: VecRestoreArrayRead(xx,&x);
1048: VecRestoreArray(zz,&zarray);
1049: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1050: return(0);
1051: }
1053: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1054: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1055: {
1056: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1057: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1058: const PetscScalar *x,*xb;
1059: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1060: const MatScalar *v;
1061: PetscErrorCode ierr;
1062: const PetscInt *ii,*ij=a->j,*idx;
1063: PetscInt mbs,i,j,n,*ridx=NULL;
1064: PetscBool usecprow=a->compressedrow.use;
1067: VecGetArrayRead(xx,&x);
1068: VecGetArray(zz,&zarray);
1070: v = a->a;
1071: if (usecprow) {
1072: mbs = a->compressedrow.nrows;
1073: ii = a->compressedrow.i;
1074: ridx = a->compressedrow.rindex;
1075: PetscArrayzero(zarray,15*a->mbs);
1076: } else {
1077: mbs = a->mbs;
1078: ii = a->i;
1079: z = zarray;
1080: }
1082: for (i=0; i<mbs; i++) {
1083: n = ii[i+1] - ii[i];
1084: idx = ij + ii[i];
1085: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1086: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1088: for (j=0; j<n; j++) {
1089: xb = x + 15*(idx[j]);
1090: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1091: x8 = xb[7];
1093: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1094: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1095: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1096: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1097: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1098: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1099: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1100: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1101: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1102: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1103: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1104: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1105: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1106: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1107: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1108: v += 120;
1110: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1112: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1113: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1114: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1115: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1116: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1117: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1118: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1119: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1120: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1121: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1122: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1123: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1124: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1125: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1126: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1127: v += 105;
1128: }
1129: if (usecprow) z = zarray + 15*ridx[i];
1130: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1131: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1133: if (!usecprow) z += 15;
1134: }
1136: VecRestoreArrayRead(xx,&x);
1137: VecRestoreArray(zz,&zarray);
1138: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1139: return(0);
1140: }
1142: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1144: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1145: {
1146: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1147: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1148: const PetscScalar *x,*xb;
1149: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1150: const MatScalar *v;
1151: PetscErrorCode ierr;
1152: const PetscInt *ii,*ij=a->j,*idx;
1153: PetscInt mbs,i,j,n,*ridx=NULL;
1154: PetscBool usecprow=a->compressedrow.use;
1157: VecGetArrayRead(xx,&x);
1158: VecGetArray(zz,&zarray);
1160: v = a->a;
1161: if (usecprow) {
1162: mbs = a->compressedrow.nrows;
1163: ii = a->compressedrow.i;
1164: ridx = a->compressedrow.rindex;
1165: PetscArrayzero(zarray,15*a->mbs);
1166: } else {
1167: mbs = a->mbs;
1168: ii = a->i;
1169: z = zarray;
1170: }
1172: for (i=0; i<mbs; i++) {
1173: n = ii[i+1] - ii[i];
1174: idx = ij + ii[i];
1175: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1176: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1178: for (j=0; j<n; j++) {
1179: xb = x + 15*(idx[j]);
1180: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1181: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1183: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1184: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1185: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1186: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1187: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1188: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1189: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1190: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1191: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1192: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1193: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1194: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1195: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1196: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1197: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1198: v += 225;
1199: }
1200: if (usecprow) z = zarray + 15*ridx[i];
1201: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1202: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1204: if (!usecprow) z += 15;
1205: }
1207: VecRestoreArrayRead(xx,&x);
1208: VecRestoreArray(zz,&zarray);
1209: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1210: return(0);
1211: }
1214: /*
1215: This will not work with MatScalar == float because it calls the BLAS
1216: */
1217: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1218: {
1219: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1220: PetscScalar *z = 0,*work,*workt,*zarray;
1221: const PetscScalar *x,*xb;
1222: const MatScalar *v;
1223: PetscErrorCode ierr;
1224: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1225: const PetscInt *idx,*ii,*ridx=NULL;
1226: PetscInt ncols,k;
1227: PetscBool usecprow=a->compressedrow.use;
1230: VecGetArrayRead(xx,&x);
1231: VecGetArray(zz,&zarray);
1233: idx = a->j;
1234: v = a->a;
1235: if (usecprow) {
1236: mbs = a->compressedrow.nrows;
1237: ii = a->compressedrow.i;
1238: ridx = a->compressedrow.rindex;
1239: PetscArrayzero(zarray,bs*a->mbs);
1240: } else {
1241: mbs = a->mbs;
1242: ii = a->i;
1243: z = zarray;
1244: }
1246: if (!a->mult_work) {
1247: k = PetscMax(A->rmap->n,A->cmap->n);
1248: PetscMalloc1(k+1,&a->mult_work);
1249: }
1250: work = a->mult_work;
1251: for (i=0; i<mbs; i++) {
1252: n = ii[1] - ii[0]; ii++;
1253: ncols = n*bs;
1254: workt = work;
1255: for (j=0; j<n; j++) {
1256: xb = x + bs*(*idx++);
1257: for (k=0; k<bs; k++) workt[k] = xb[k];
1258: workt += bs;
1259: }
1260: if (usecprow) z = zarray + bs*ridx[i];
1261: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1262: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1263: v += n*bs2;
1264: if (!usecprow) z += bs;
1265: }
1266: VecRestoreArrayRead(xx,&x);
1267: VecRestoreArray(zz,&zarray);
1268: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1269: return(0);
1270: }
1272: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1273: {
1274: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1275: const PetscScalar *x;
1276: PetscScalar *y,*z,sum;
1277: const MatScalar *v;
1278: PetscErrorCode ierr;
1279: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1280: const PetscInt *idx,*ii;
1281: PetscBool usecprow=a->compressedrow.use;
1284: VecGetArrayRead(xx,&x);
1285: VecGetArrayPair(yy,zz,&y,&z);
1287: idx = a->j;
1288: v = a->a;
1289: if (usecprow) {
1290: if (zz != yy) {
1291: PetscArraycpy(z,y,mbs);
1292: }
1293: mbs = a->compressedrow.nrows;
1294: ii = a->compressedrow.i;
1295: ridx = a->compressedrow.rindex;
1296: } else {
1297: ii = a->i;
1298: }
1300: for (i=0; i<mbs; i++) {
1301: n = ii[1] - ii[0];
1302: ii++;
1303: if (!usecprow) {
1304: sum = y[i];
1305: } else {
1306: sum = y[ridx[i]];
1307: }
1308: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1309: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1310: PetscSparseDensePlusDot(sum,x,v,idx,n);
1311: v += n;
1312: idx += n;
1313: if (usecprow) {
1314: z[ridx[i]] = sum;
1315: } else {
1316: z[i] = sum;
1317: }
1318: }
1319: VecRestoreArrayRead(xx,&x);
1320: VecRestoreArrayPair(yy,zz,&y,&z);
1321: PetscLogFlops(2.0*a->nz);
1322: return(0);
1323: }
1325: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1326: {
1327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1328: PetscScalar *y = 0,*z = 0,sum1,sum2;
1329: const PetscScalar *x,*xb;
1330: PetscScalar x1,x2,*yarray,*zarray;
1331: const MatScalar *v;
1332: PetscErrorCode ierr;
1333: PetscInt mbs = a->mbs,i,n,j;
1334: const PetscInt *idx,*ii,*ridx = NULL;
1335: PetscBool usecprow = a->compressedrow.use;
1338: VecGetArrayRead(xx,&x);
1339: VecGetArrayPair(yy,zz,&yarray,&zarray);
1341: idx = a->j;
1342: v = a->a;
1343: if (usecprow) {
1344: if (zz != yy) {
1345: PetscArraycpy(zarray,yarray,2*mbs);
1346: }
1347: mbs = a->compressedrow.nrows;
1348: ii = a->compressedrow.i;
1349: ridx = a->compressedrow.rindex;
1350: } else {
1351: ii = a->i;
1352: y = yarray;
1353: z = zarray;
1354: }
1356: for (i=0; i<mbs; i++) {
1357: n = ii[1] - ii[0]; ii++;
1358: if (usecprow) {
1359: z = zarray + 2*ridx[i];
1360: y = yarray + 2*ridx[i];
1361: }
1362: sum1 = y[0]; sum2 = y[1];
1363: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1364: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1365: for (j=0; j<n; j++) {
1366: xb = x + 2*(*idx++);
1367: x1 = xb[0];
1368: x2 = xb[1];
1370: sum1 += v[0]*x1 + v[2]*x2;
1371: sum2 += v[1]*x1 + v[3]*x2;
1372: v += 4;
1373: }
1374: z[0] = sum1; z[1] = sum2;
1375: if (!usecprow) {
1376: z += 2; y += 2;
1377: }
1378: }
1379: VecRestoreArrayRead(xx,&x);
1380: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1381: PetscLogFlops(4.0*a->nz);
1382: return(0);
1383: }
1385: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1386: {
1387: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1388: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1389: const PetscScalar *x,*xb;
1390: const MatScalar *v;
1391: PetscErrorCode ierr;
1392: PetscInt mbs = a->mbs,i,j,n;
1393: const PetscInt *idx,*ii,*ridx = NULL;
1394: PetscBool usecprow = a->compressedrow.use;
1397: VecGetArrayRead(xx,&x);
1398: VecGetArrayPair(yy,zz,&yarray,&zarray);
1400: idx = a->j;
1401: v = a->a;
1402: if (usecprow) {
1403: if (zz != yy) {
1404: PetscArraycpy(zarray,yarray,3*mbs);
1405: }
1406: mbs = a->compressedrow.nrows;
1407: ii = a->compressedrow.i;
1408: ridx = a->compressedrow.rindex;
1409: } else {
1410: ii = a->i;
1411: y = yarray;
1412: z = zarray;
1413: }
1415: for (i=0; i<mbs; i++) {
1416: n = ii[1] - ii[0]; ii++;
1417: if (usecprow) {
1418: z = zarray + 3*ridx[i];
1419: y = yarray + 3*ridx[i];
1420: }
1421: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1422: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1423: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1424: for (j=0; j<n; j++) {
1425: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1426: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1427: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1428: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1429: v += 9;
1430: }
1431: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1432: if (!usecprow) {
1433: z += 3; y += 3;
1434: }
1435: }
1436: VecRestoreArrayRead(xx,&x);
1437: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1438: PetscLogFlops(18.0*a->nz);
1439: return(0);
1440: }
1442: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1443: {
1444: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1445: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1446: const PetscScalar *x,*xb;
1447: const MatScalar *v;
1448: PetscErrorCode ierr;
1449: PetscInt mbs = a->mbs,i,j,n;
1450: const PetscInt *idx,*ii,*ridx=NULL;
1451: PetscBool usecprow=a->compressedrow.use;
1454: VecGetArrayRead(xx,&x);
1455: VecGetArrayPair(yy,zz,&yarray,&zarray);
1457: idx = a->j;
1458: v = a->a;
1459: if (usecprow) {
1460: if (zz != yy) {
1461: PetscArraycpy(zarray,yarray,4*mbs);
1462: }
1463: mbs = a->compressedrow.nrows;
1464: ii = a->compressedrow.i;
1465: ridx = a->compressedrow.rindex;
1466: } else {
1467: ii = a->i;
1468: y = yarray;
1469: z = zarray;
1470: }
1472: for (i=0; i<mbs; i++) {
1473: n = ii[1] - ii[0]; ii++;
1474: if (usecprow) {
1475: z = zarray + 4*ridx[i];
1476: y = yarray + 4*ridx[i];
1477: }
1478: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1479: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1480: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1481: for (j=0; j<n; j++) {
1482: xb = x + 4*(*idx++);
1483: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1484: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1485: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1486: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1487: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1488: v += 16;
1489: }
1490: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1491: if (!usecprow) {
1492: z += 4; y += 4;
1493: }
1494: }
1495: VecRestoreArrayRead(xx,&x);
1496: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1497: PetscLogFlops(32.0*a->nz);
1498: return(0);
1499: }
1501: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1502: {
1503: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1504: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1505: const PetscScalar *x,*xb;
1506: PetscScalar *yarray,*zarray;
1507: const MatScalar *v;
1508: PetscErrorCode ierr;
1509: PetscInt mbs = a->mbs,i,j,n;
1510: const PetscInt *idx,*ii,*ridx = NULL;
1511: PetscBool usecprow=a->compressedrow.use;
1514: VecGetArrayRead(xx,&x);
1515: VecGetArrayPair(yy,zz,&yarray,&zarray);
1517: idx = a->j;
1518: v = a->a;
1519: if (usecprow) {
1520: if (zz != yy) {
1521: PetscArraycpy(zarray,yarray,5*mbs);
1522: }
1523: mbs = a->compressedrow.nrows;
1524: ii = a->compressedrow.i;
1525: ridx = a->compressedrow.rindex;
1526: } else {
1527: ii = a->i;
1528: y = yarray;
1529: z = zarray;
1530: }
1532: for (i=0; i<mbs; i++) {
1533: n = ii[1] - ii[0]; ii++;
1534: if (usecprow) {
1535: z = zarray + 5*ridx[i];
1536: y = yarray + 5*ridx[i];
1537: }
1538: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1539: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1540: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1541: for (j=0; j<n; j++) {
1542: xb = x + 5*(*idx++);
1543: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1544: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1545: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1546: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1547: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1548: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1549: v += 25;
1550: }
1551: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1552: if (!usecprow) {
1553: z += 5; y += 5;
1554: }
1555: }
1556: VecRestoreArrayRead(xx,&x);
1557: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1558: PetscLogFlops(50.0*a->nz);
1559: return(0);
1560: }
1561: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1562: {
1563: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1564: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1565: const PetscScalar *x,*xb;
1566: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1567: const MatScalar *v;
1568: PetscErrorCode ierr;
1569: PetscInt mbs = a->mbs,i,j,n;
1570: const PetscInt *idx,*ii,*ridx=NULL;
1571: PetscBool usecprow=a->compressedrow.use;
1574: VecGetArrayRead(xx,&x);
1575: VecGetArrayPair(yy,zz,&yarray,&zarray);
1577: idx = a->j;
1578: v = a->a;
1579: if (usecprow) {
1580: if (zz != yy) {
1581: PetscArraycpy(zarray,yarray,6*mbs);
1582: }
1583: mbs = a->compressedrow.nrows;
1584: ii = a->compressedrow.i;
1585: ridx = a->compressedrow.rindex;
1586: } else {
1587: ii = a->i;
1588: y = yarray;
1589: z = zarray;
1590: }
1592: for (i=0; i<mbs; i++) {
1593: n = ii[1] - ii[0]; ii++;
1594: if (usecprow) {
1595: z = zarray + 6*ridx[i];
1596: y = yarray + 6*ridx[i];
1597: }
1598: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1599: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1600: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1601: for (j=0; j<n; j++) {
1602: xb = x + 6*(*idx++);
1603: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1604: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1605: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1606: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1607: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1608: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1609: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1610: v += 36;
1611: }
1612: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1613: if (!usecprow) {
1614: z += 6; y += 6;
1615: }
1616: }
1617: VecRestoreArrayRead(xx,&x);
1618: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1619: PetscLogFlops(72.0*a->nz);
1620: return(0);
1621: }
1623: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1624: {
1625: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1626: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1627: const PetscScalar *x,*xb;
1628: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1629: const MatScalar *v;
1630: PetscErrorCode ierr;
1631: PetscInt mbs = a->mbs,i,j,n;
1632: const PetscInt *idx,*ii,*ridx = NULL;
1633: PetscBool usecprow=a->compressedrow.use;
1636: VecGetArrayRead(xx,&x);
1637: VecGetArrayPair(yy,zz,&yarray,&zarray);
1639: idx = a->j;
1640: v = a->a;
1641: if (usecprow) {
1642: if (zz != yy) {
1643: PetscArraycpy(zarray,yarray,7*mbs);
1644: }
1645: mbs = a->compressedrow.nrows;
1646: ii = a->compressedrow.i;
1647: ridx = a->compressedrow.rindex;
1648: } else {
1649: ii = a->i;
1650: y = yarray;
1651: z = zarray;
1652: }
1654: for (i=0; i<mbs; i++) {
1655: n = ii[1] - ii[0]; ii++;
1656: if (usecprow) {
1657: z = zarray + 7*ridx[i];
1658: y = yarray + 7*ridx[i];
1659: }
1660: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1661: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1662: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1663: for (j=0; j<n; j++) {
1664: xb = x + 7*(*idx++);
1665: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1666: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1667: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1668: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1669: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1670: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1671: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1672: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1673: v += 49;
1674: }
1675: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1676: if (!usecprow) {
1677: z += 7; y += 7;
1678: }
1679: }
1680: VecRestoreArrayRead(xx,&x);
1681: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1682: PetscLogFlops(98.0*a->nz);
1683: return(0);
1684: }
1686: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1687: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1688: {
1689: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1690: PetscScalar *z = 0,*work,*workt,*zarray;
1691: const PetscScalar *x,*xb;
1692: const MatScalar *v;
1694: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1695: PetscInt k;
1696: PetscBool usecprow=a->compressedrow.use;
1697: const PetscInt *idx,*ii,*ridx=NULL;
1699: __m256d a0,a1,a2,a3,a4,a5;
1700: __m256d w0,w1,w2,w3;
1701: __m256d z0,z1,z2;
1702: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1705: VecCopy(yy,zz);
1706: VecGetArrayRead(xx,&x);
1707: VecGetArray(zz,&zarray);
1709: idx = a->j;
1710: v = a->a;
1711: if (usecprow) {
1712: mbs = a->compressedrow.nrows;
1713: ii = a->compressedrow.i;
1714: ridx = a->compressedrow.rindex;
1715: } else {
1716: mbs = a->mbs;
1717: ii = a->i;
1718: z = zarray;
1719: }
1721: if (!a->mult_work) {
1722: k = PetscMax(A->rmap->n,A->cmap->n);
1723: PetscMalloc1(k+1,&a->mult_work);
1724: }
1726: work = a->mult_work;
1727: for (i=0; i<mbs; i++) {
1728: n = ii[1] - ii[0]; ii++;
1729: workt = work;
1730: for (j=0; j<n; j++) {
1731: xb = x + bs*(*idx++);
1732: for (k=0; k<bs; k++) workt[k] = xb[k];
1733: workt += bs;
1734: }
1735: if (usecprow) z = zarray + bs*ridx[i];
1737: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1739: for (j=0; j<n; j++) {
1740: /* first column of a */
1741: w0 = _mm256_set1_pd(work[j*9 ]);
1742: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1743: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1744: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1746: /* second column of a */
1747: w1 = _mm256_set1_pd(work[j*9+ 1]);
1748: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1749: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1750: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1752: /* third column of a */
1753: w2 = _mm256_set1_pd(work[j*9+ 2]);
1754: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1755: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1756: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1758: /* fourth column of a */
1759: w3 = _mm256_set1_pd(work[j*9+ 3]);
1760: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1761: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1762: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1764: /* fifth column of a */
1765: w0 = _mm256_set1_pd(work[j*9+ 4]);
1766: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1767: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1768: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1770: /* sixth column of a */
1771: w1 = _mm256_set1_pd(work[j*9+ 5]);
1772: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1773: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1774: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1776: /* seventh column of a */
1777: w2 = _mm256_set1_pd(work[j*9+ 6]);
1778: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1779: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1780: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1782: /* eigth column of a */
1783: w3 = _mm256_set1_pd(work[j*9+ 7]);
1784: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1785: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1786: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1788: /* ninth column of a */
1789: w0 = _mm256_set1_pd(work[j*9+ 8]);
1790: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1791: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1792: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1793: }
1795: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1797: v += n*bs2;
1798: if (!usecprow) z += bs;
1799: }
1800: VecRestoreArrayRead(xx,&x);
1801: VecRestoreArray(zz,&zarray);
1802: PetscLogFlops(162.0*a->nz);
1803: return(0);
1804: }
1805: #endif
1807: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1808: {
1809: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1810: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1811: const PetscScalar *x,*xb;
1812: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1813: const MatScalar *v;
1814: PetscErrorCode ierr;
1815: PetscInt mbs = a->mbs,i,j,n;
1816: const PetscInt *idx,*ii,*ridx = NULL;
1817: PetscBool usecprow=a->compressedrow.use;
1820: VecGetArrayRead(xx,&x);
1821: VecGetArrayPair(yy,zz,&yarray,&zarray);
1823: idx = a->j;
1824: v = a->a;
1825: if (usecprow) {
1826: if (zz != yy) {
1827: PetscArraycpy(zarray,yarray,7*mbs);
1828: }
1829: mbs = a->compressedrow.nrows;
1830: ii = a->compressedrow.i;
1831: ridx = a->compressedrow.rindex;
1832: } else {
1833: ii = a->i;
1834: y = yarray;
1835: z = zarray;
1836: }
1838: for (i=0; i<mbs; i++) {
1839: n = ii[1] - ii[0]; ii++;
1840: if (usecprow) {
1841: z = zarray + 11*ridx[i];
1842: y = yarray + 11*ridx[i];
1843: }
1844: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1845: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1846: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1847: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1848: for (j=0; j<n; j++) {
1849: xb = x + 11*(*idx++);
1850: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1851: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1852: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1853: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1854: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1855: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1856: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1857: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1858: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1859: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1860: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1861: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1862: v += 121;
1863: }
1864: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1865: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1866: if (!usecprow) {
1867: z += 11; y += 11;
1868: }
1869: }
1870: VecRestoreArrayRead(xx,&x);
1871: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1872: PetscLogFlops(242.0*a->nz);
1873: return(0);
1874: }
1876: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1877: {
1878: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1879: PetscScalar *z = 0,*work,*workt,*zarray;
1880: const PetscScalar *x,*xb;
1881: const MatScalar *v;
1882: PetscErrorCode ierr;
1883: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1884: PetscInt ncols,k;
1885: const PetscInt *ridx = NULL,*idx,*ii;
1886: PetscBool usecprow = a->compressedrow.use;
1889: VecCopy(yy,zz);
1890: VecGetArrayRead(xx,&x);
1891: VecGetArray(zz,&zarray);
1893: idx = a->j;
1894: v = a->a;
1895: if (usecprow) {
1896: mbs = a->compressedrow.nrows;
1897: ii = a->compressedrow.i;
1898: ridx = a->compressedrow.rindex;
1899: } else {
1900: mbs = a->mbs;
1901: ii = a->i;
1902: z = zarray;
1903: }
1905: if (!a->mult_work) {
1906: k = PetscMax(A->rmap->n,A->cmap->n);
1907: PetscMalloc1(k+1,&a->mult_work);
1908: }
1909: work = a->mult_work;
1910: for (i=0; i<mbs; i++) {
1911: n = ii[1] - ii[0]; ii++;
1912: ncols = n*bs;
1913: workt = work;
1914: for (j=0; j<n; j++) {
1915: xb = x + bs*(*idx++);
1916: for (k=0; k<bs; k++) workt[k] = xb[k];
1917: workt += bs;
1918: }
1919: if (usecprow) z = zarray + bs*ridx[i];
1920: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1921: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1922: v += n*bs2;
1923: if (!usecprow) z += bs;
1924: }
1925: VecRestoreArrayRead(xx,&x);
1926: VecRestoreArray(zz,&zarray);
1927: PetscLogFlops(2.0*a->nz*bs2);
1928: return(0);
1929: }
1931: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1932: {
1933: PetscScalar zero = 0.0;
1937: VecSet(zz,zero);
1938: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1939: return(0);
1940: }
1942: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1943: {
1944: PetscScalar zero = 0.0;
1948: VecSet(zz,zero);
1949: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1950: return(0);
1951: }
1953: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1954: {
1955: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1956: PetscScalar *z,x1,x2,x3,x4,x5;
1957: const PetscScalar *x,*xb = NULL;
1958: const MatScalar *v;
1959: PetscErrorCode ierr;
1960: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1961: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1962: Mat_CompressedRow cprow = a->compressedrow;
1963: PetscBool usecprow = cprow.use;
1966: if (yy != zz) { VecCopy(yy,zz); }
1967: VecGetArrayRead(xx,&x);
1968: VecGetArray(zz,&z);
1970: idx = a->j;
1971: v = a->a;
1972: if (usecprow) {
1973: mbs = cprow.nrows;
1974: ii = cprow.i;
1975: ridx = cprow.rindex;
1976: } else {
1977: mbs=a->mbs;
1978: ii = a->i;
1979: xb = x;
1980: }
1982: switch (bs) {
1983: case 1:
1984: for (i=0; i<mbs; i++) {
1985: if (usecprow) xb = x + ridx[i];
1986: x1 = xb[0];
1987: ib = idx + ii[0];
1988: n = ii[1] - ii[0]; ii++;
1989: for (j=0; j<n; j++) {
1990: rval = ib[j];
1991: z[rval] += PetscConj(*v) * x1;
1992: v++;
1993: }
1994: if (!usecprow) xb++;
1995: }
1996: break;
1997: case 2:
1998: for (i=0; i<mbs; i++) {
1999: if (usecprow) xb = x + 2*ridx[i];
2000: x1 = xb[0]; x2 = xb[1];
2001: ib = idx + ii[0];
2002: n = ii[1] - ii[0]; ii++;
2003: for (j=0; j<n; j++) {
2004: rval = ib[j]*2;
2005: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2006: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2007: v += 4;
2008: }
2009: if (!usecprow) xb += 2;
2010: }
2011: break;
2012: case 3:
2013: for (i=0; i<mbs; i++) {
2014: if (usecprow) xb = x + 3*ridx[i];
2015: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2016: ib = idx + ii[0];
2017: n = ii[1] - ii[0]; ii++;
2018: for (j=0; j<n; j++) {
2019: rval = ib[j]*3;
2020: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2021: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2022: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2023: v += 9;
2024: }
2025: if (!usecprow) xb += 3;
2026: }
2027: break;
2028: case 4:
2029: for (i=0; i<mbs; i++) {
2030: if (usecprow) xb = x + 4*ridx[i];
2031: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2032: ib = idx + ii[0];
2033: n = ii[1] - ii[0]; ii++;
2034: for (j=0; j<n; j++) {
2035: rval = ib[j]*4;
2036: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2037: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2038: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2039: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2040: v += 16;
2041: }
2042: if (!usecprow) xb += 4;
2043: }
2044: break;
2045: case 5:
2046: for (i=0; i<mbs; i++) {
2047: if (usecprow) xb = x + 5*ridx[i];
2048: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2049: x4 = xb[3]; x5 = xb[4];
2050: ib = idx + ii[0];
2051: n = ii[1] - ii[0]; ii++;
2052: for (j=0; j<n; j++) {
2053: rval = ib[j]*5;
2054: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2055: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2056: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2057: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2058: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2059: v += 25;
2060: }
2061: if (!usecprow) xb += 5;
2062: }
2063: break;
2064: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2065: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2066: #if 0
2067: {
2068: PetscInt ncols,k,bs2=a->bs2;
2069: PetscScalar *work,*workt,zb;
2070: const PetscScalar *xtmp;
2071: if (!a->mult_work) {
2072: k = PetscMax(A->rmap->n,A->cmap->n);
2073: PetscMalloc1(k+1,&a->mult_work);
2074: }
2075: work = a->mult_work;
2076: xtmp = x;
2077: for (i=0; i<mbs; i++) {
2078: n = ii[1] - ii[0]; ii++;
2079: ncols = n*bs;
2080: PetscArrayzero(work,ncols);
2081: if (usecprow) xtmp = x + bs*ridx[i];
2082: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2083: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2084: v += n*bs2;
2085: if (!usecprow) xtmp += bs;
2086: workt = work;
2087: for (j=0; j<n; j++) {
2088: zb = z + bs*(*idx++);
2089: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2090: workt += bs;
2091: }
2092: }
2093: }
2094: #endif
2095: }
2096: VecRestoreArrayRead(xx,&x);
2097: VecRestoreArray(zz,&z);
2098: PetscLogFlops(2.0*a->nz*a->bs2);
2099: return(0);
2100: }
2102: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2103: {
2104: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2105: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2106: const PetscScalar *x,*xb = 0;
2107: const MatScalar *v;
2108: PetscErrorCode ierr;
2109: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2110: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2111: Mat_CompressedRow cprow = a->compressedrow;
2112: PetscBool usecprow=cprow.use;
2115: if (yy != zz) { VecCopy(yy,zz); }
2116: VecGetArrayRead(xx,&x);
2117: VecGetArray(zz,&z);
2119: idx = a->j;
2120: v = a->a;
2121: if (usecprow) {
2122: mbs = cprow.nrows;
2123: ii = cprow.i;
2124: ridx = cprow.rindex;
2125: } else {
2126: mbs=a->mbs;
2127: ii = a->i;
2128: xb = x;
2129: }
2131: switch (bs) {
2132: case 1:
2133: for (i=0; i<mbs; i++) {
2134: if (usecprow) xb = x + ridx[i];
2135: x1 = xb[0];
2136: ib = idx + ii[0];
2137: n = ii[1] - ii[0]; ii++;
2138: for (j=0; j<n; j++) {
2139: rval = ib[j];
2140: z[rval] += *v * x1;
2141: v++;
2142: }
2143: if (!usecprow) xb++;
2144: }
2145: break;
2146: case 2:
2147: for (i=0; i<mbs; i++) {
2148: if (usecprow) xb = x + 2*ridx[i];
2149: x1 = xb[0]; x2 = xb[1];
2150: ib = idx + ii[0];
2151: n = ii[1] - ii[0]; ii++;
2152: for (j=0; j<n; j++) {
2153: rval = ib[j]*2;
2154: z[rval++] += v[0]*x1 + v[1]*x2;
2155: z[rval++] += v[2]*x1 + v[3]*x2;
2156: v += 4;
2157: }
2158: if (!usecprow) xb += 2;
2159: }
2160: break;
2161: case 3:
2162: for (i=0; i<mbs; i++) {
2163: if (usecprow) xb = x + 3*ridx[i];
2164: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2165: ib = idx + ii[0];
2166: n = ii[1] - ii[0]; ii++;
2167: for (j=0; j<n; j++) {
2168: rval = ib[j]*3;
2169: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2170: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2171: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2172: v += 9;
2173: }
2174: if (!usecprow) xb += 3;
2175: }
2176: break;
2177: case 4:
2178: for (i=0; i<mbs; i++) {
2179: if (usecprow) xb = x + 4*ridx[i];
2180: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2181: ib = idx + ii[0];
2182: n = ii[1] - ii[0]; ii++;
2183: for (j=0; j<n; j++) {
2184: rval = ib[j]*4;
2185: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2186: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2187: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2188: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2189: v += 16;
2190: }
2191: if (!usecprow) xb += 4;
2192: }
2193: break;
2194: case 5:
2195: for (i=0; i<mbs; i++) {
2196: if (usecprow) xb = x + 5*ridx[i];
2197: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2198: x4 = xb[3]; x5 = xb[4];
2199: ib = idx + ii[0];
2200: n = ii[1] - ii[0]; ii++;
2201: for (j=0; j<n; j++) {
2202: rval = ib[j]*5;
2203: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2204: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2205: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2206: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2207: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2208: v += 25;
2209: }
2210: if (!usecprow) xb += 5;
2211: }
2212: break;
2213: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2214: PetscInt ncols,k;
2215: PetscScalar *work,*workt;
2216: const PetscScalar *xtmp;
2217: if (!a->mult_work) {
2218: k = PetscMax(A->rmap->n,A->cmap->n);
2219: PetscMalloc1(k+1,&a->mult_work);
2220: }
2221: work = a->mult_work;
2222: xtmp = x;
2223: for (i=0; i<mbs; i++) {
2224: n = ii[1] - ii[0]; ii++;
2225: ncols = n*bs;
2226: PetscArrayzero(work,ncols);
2227: if (usecprow) xtmp = x + bs*ridx[i];
2228: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2229: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2230: v += n*bs2;
2231: if (!usecprow) xtmp += bs;
2232: workt = work;
2233: for (j=0; j<n; j++) {
2234: zb = z + bs*(*idx++);
2235: for (k=0; k<bs; k++) zb[k] += workt[k];
2236: workt += bs;
2237: }
2238: }
2239: }
2240: }
2241: VecRestoreArrayRead(xx,&x);
2242: VecRestoreArray(zz,&z);
2243: PetscLogFlops(2.0*a->nz*a->bs2);
2244: return(0);
2245: }
2247: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2248: {
2249: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2250: PetscInt totalnz = a->bs2*a->nz;
2251: PetscScalar oalpha = alpha;
2253: PetscBLASInt one = 1,tnz;
2256: PetscBLASIntCast(totalnz,&tnz);
2257: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2258: PetscLogFlops(totalnz);
2259: return(0);
2260: }
2262: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2263: {
2265: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2266: MatScalar *v = a->a;
2267: PetscReal sum = 0.0;
2268: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2271: if (type == NORM_FROBENIUS) {
2272: #if defined(PETSC_USE_REAL___FP16)
2273: PetscBLASInt one = 1,cnt = bs2*nz;
2274: *norm = BLASnrm2_(&cnt,v,&one);
2275: #else
2276: for (i=0; i<bs2*nz; i++) {
2277: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2278: }
2279: #endif
2280: *norm = PetscSqrtReal(sum);
2281: PetscLogFlops(2*bs2*nz);
2282: } else if (type == NORM_1) { /* maximum column sum */
2283: PetscReal *tmp;
2284: PetscInt *bcol = a->j;
2285: PetscCalloc1(A->cmap->n+1,&tmp);
2286: for (i=0; i<nz; i++) {
2287: for (j=0; j<bs; j++) {
2288: k1 = bs*(*bcol) + j; /* column index */
2289: for (k=0; k<bs; k++) {
2290: tmp[k1] += PetscAbsScalar(*v); v++;
2291: }
2292: }
2293: bcol++;
2294: }
2295: *norm = 0.0;
2296: for (j=0; j<A->cmap->n; j++) {
2297: if (tmp[j] > *norm) *norm = tmp[j];
2298: }
2299: PetscFree(tmp);
2300: PetscLogFlops(PetscMax(bs2*nz-1,0));
2301: } else if (type == NORM_INFINITY) { /* maximum row sum */
2302: *norm = 0.0;
2303: for (k=0; k<bs; k++) {
2304: for (j=0; j<a->mbs; j++) {
2305: v = a->a + bs2*a->i[j] + k;
2306: sum = 0.0;
2307: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2308: for (k1=0; k1<bs; k1++) {
2309: sum += PetscAbsScalar(*v);
2310: v += bs;
2311: }
2312: }
2313: if (sum > *norm) *norm = sum;
2314: }
2315: }
2316: PetscLogFlops(PetscMax(bs2*nz-1,0));
2317: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2318: return(0);
2319: }
2322: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2323: {
2324: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2328: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2329: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2330: *flg = PETSC_FALSE;
2331: return(0);
2332: }
2334: /* if the a->i are the same */
2335: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2336: if (!*flg) return(0);
2338: /* if a->j are the same */
2339: PetscArraycmp(a->j,b->j,a->nz,flg);
2340: if (!*flg) return(0);
2342: /* if a->a are the same */
2343: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2344: return(0);
2346: }
2348: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2349: {
2350: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2352: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2353: PetscScalar *x,zero = 0.0;
2354: MatScalar *aa,*aa_j;
2357: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2358: bs = A->rmap->bs;
2359: aa = a->a;
2360: ai = a->i;
2361: aj = a->j;
2362: ambs = a->mbs;
2363: bs2 = a->bs2;
2365: VecSet(v,zero);
2366: VecGetArray(v,&x);
2367: VecGetLocalSize(v,&n);
2368: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2369: for (i=0; i<ambs; i++) {
2370: for (j=ai[i]; j<ai[i+1]; j++) {
2371: if (aj[j] == i) {
2372: row = i*bs;
2373: aa_j = aa+j*bs2;
2374: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2375: break;
2376: }
2377: }
2378: }
2379: VecRestoreArray(v,&x);
2380: return(0);
2381: }
2383: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2384: {
2385: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2386: const PetscScalar *l,*r,*li,*ri;
2387: PetscScalar x;
2388: MatScalar *aa, *v;
2389: PetscErrorCode ierr;
2390: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2391: const PetscInt *ai,*aj;
2394: ai = a->i;
2395: aj = a->j;
2396: aa = a->a;
2397: m = A->rmap->n;
2398: n = A->cmap->n;
2399: bs = A->rmap->bs;
2400: mbs = a->mbs;
2401: bs2 = a->bs2;
2402: if (ll) {
2403: VecGetArrayRead(ll,&l);
2404: VecGetLocalSize(ll,&lm);
2405: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2406: for (i=0; i<mbs; i++) { /* for each block row */
2407: M = ai[i+1] - ai[i];
2408: li = l + i*bs;
2409: v = aa + bs2*ai[i];
2410: for (j=0; j<M; j++) { /* for each block */
2411: for (k=0; k<bs2; k++) {
2412: (*v++) *= li[k%bs];
2413: }
2414: }
2415: }
2416: VecRestoreArrayRead(ll,&l);
2417: PetscLogFlops(a->nz);
2418: }
2420: if (rr) {
2421: VecGetArrayRead(rr,&r);
2422: VecGetLocalSize(rr,&rn);
2423: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2424: for (i=0; i<mbs; i++) { /* for each block row */
2425: iai = ai[i];
2426: M = ai[i+1] - iai;
2427: v = aa + bs2*iai;
2428: for (j=0; j<M; j++) { /* for each block */
2429: ri = r + bs*aj[iai+j];
2430: for (k=0; k<bs; k++) {
2431: x = ri[k];
2432: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2433: v += bs;
2434: }
2435: }
2436: }
2437: VecRestoreArrayRead(rr,&r);
2438: PetscLogFlops(a->nz);
2439: }
2440: return(0);
2441: }
2444: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2445: {
2446: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2449: info->block_size = a->bs2;
2450: info->nz_allocated = a->bs2*a->maxnz;
2451: info->nz_used = a->bs2*a->nz;
2452: info->nz_unneeded = info->nz_allocated - info->nz_used;
2453: info->assemblies = A->num_ass;
2454: info->mallocs = A->info.mallocs;
2455: info->memory = ((PetscObject)A)->mem;
2456: if (A->factortype) {
2457: info->fill_ratio_given = A->info.fill_ratio_given;
2458: info->fill_ratio_needed = A->info.fill_ratio_needed;
2459: info->factor_mallocs = A->info.factor_mallocs;
2460: } else {
2461: info->fill_ratio_given = 0;
2462: info->fill_ratio_needed = 0;
2463: info->factor_mallocs = 0;
2464: }
2465: return(0);
2466: }
2468: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2469: {
2470: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2474: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2475: return(0);
2476: }
2478: PETSC_INTERN PetscErrorCode MatMatMult_SeqBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2479: {
2483: if (scall == MAT_INITIAL_MATRIX) {
2484: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
2485: MatMatMultSymbolic_SeqBAIJ_SeqDense(A,B,fill,C);
2486: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
2487: }
2488: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
2489: MatMatMultNumeric_SeqBAIJ_SeqDense(A,B,*C);
2490: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
2491: return(0);
2492: }
2494: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
2495: {
2499: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2501: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2502: return(0);
2503: }
2505: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2506: {
2507: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2508: PetscScalar *z = 0,sum1;
2509: const PetscScalar *xb;
2510: PetscScalar x1;
2511: const MatScalar *v,*vv;
2512: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2513: PetscBool usecprow=a->compressedrow.use;
2517: idx = a->j;
2518: v = a->a;
2519: if (usecprow) {
2520: mbs = a->compressedrow.nrows;
2521: ii = a->compressedrow.i;
2522: ridx = a->compressedrow.rindex;
2523: } else {
2524: mbs = a->mbs;
2525: ii = a->i;
2526: z = c;
2527: }
2529: for (i=0; i<mbs; i++) {
2530: n = ii[1] - ii[0]; ii++;
2531: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2532: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2533: if (usecprow) z = c + ridx[i];
2534: jj = idx;
2535: vv = v;
2536: for (k=0; k<cn; k++) {
2537: idx = jj;
2538: v = vv;
2539: sum1 = 0.0;
2540: for (j=0; j<n; j++) {
2541: xb = b + (*idx++); x1 = xb[0+k*cm];
2542: sum1 += v[0]*x1;
2543: v += 1;
2544: }
2545: z[0+k*cm] = sum1;
2546: }
2547: if (!usecprow) z += 1;
2548: }
2549: return(0);
2550: }
2552: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2553: {
2554: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2555: PetscScalar *z = 0,sum1,sum2;
2556: const PetscScalar *xb;
2557: PetscScalar x1,x2;
2558: const MatScalar *v,*vv;
2559: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2560: PetscBool usecprow=a->compressedrow.use;
2564: idx = a->j;
2565: v = a->a;
2566: if (usecprow) {
2567: mbs = a->compressedrow.nrows;
2568: ii = a->compressedrow.i;
2569: ridx = a->compressedrow.rindex;
2570: } else {
2571: mbs = a->mbs;
2572: ii = a->i;
2573: z = c;
2574: }
2576: for (i=0; i<mbs; i++) {
2577: n = ii[1] - ii[0]; ii++;
2578: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2579: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2580: if (usecprow) z = c + 2*ridx[i];
2581: jj = idx;
2582: vv = v;
2583: for (k=0; k<cn; k++) {
2584: idx = jj;
2585: v = vv;
2586: sum1 = 0.0; sum2 = 0.0;
2587: for (j=0; j<n; j++) {
2588: xb = b + 2*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm];
2589: sum1 += v[0]*x1 + v[2]*x2;
2590: sum2 += v[1]*x1 + v[3]*x2;
2591: v += 4;
2592: }
2593: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2594: }
2595: if (!usecprow) z += 2;
2596: }
2597: return(0);
2598: }
2600: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2601: {
2602: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2603: PetscScalar *z = 0,sum1,sum2,sum3;
2604: const PetscScalar *xb;
2605: PetscScalar x1,x2,x3;
2606: const MatScalar *v,*vv;
2607: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2608: PetscBool usecprow=a->compressedrow.use;
2612: idx = a->j;
2613: v = a->a;
2614: if (usecprow) {
2615: mbs = a->compressedrow.nrows;
2616: ii = a->compressedrow.i;
2617: ridx = a->compressedrow.rindex;
2618: } else {
2619: mbs = a->mbs;
2620: ii = a->i;
2621: z = c;
2622: }
2624: for (i=0; i<mbs; i++) {
2625: n = ii[1] - ii[0]; ii++;
2626: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2627: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2628: if (usecprow) z = c + 3*ridx[i];
2629: jj = idx;
2630: vv = v;
2631: for (k=0; k<cn; k++) {
2632: idx = jj;
2633: v = vv;
2634: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
2635: for (j=0; j<n; j++) {
2636: xb = b + 3*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm];
2637: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2638: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2639: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2640: v += 9;
2641: }
2642: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2643: }
2644: if (!usecprow) z += 3;
2645: }
2646: return(0);
2647: }
2649: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2650: {
2651: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2652: PetscScalar *z = 0,sum1,sum2,sum3,sum4;
2653: const PetscScalar *xb;
2654: PetscScalar x1,x2,x3,x4;
2655: const MatScalar *v,*vv;
2656: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2657: PetscBool usecprow=a->compressedrow.use;
2661: idx = a->j;
2662: v = a->a;
2663: if (usecprow) {
2664: mbs = a->compressedrow.nrows;
2665: ii = a->compressedrow.i;
2666: ridx = a->compressedrow.rindex;
2667: } else {
2668: mbs = a->mbs;
2669: ii = a->i;
2670: z = c;
2671: }
2673: for (i=0; i<mbs; i++) {
2674: n = ii[1] - ii[0]; ii++;
2675: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2676: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2677: if (usecprow) z = c + 4*ridx[i];
2678: jj = idx;
2679: vv = v;
2680: for (k=0; k<cn; k++) {
2681: idx = jj;
2682: v = vv;
2683: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2684: for (j=0; j<n; j++) {
2685: xb = b + 4*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm];
2686: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2687: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2688: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2689: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2690: v += 16;
2691: }
2692: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2693: }
2694: if (!usecprow) z += 4;
2695: }
2696: return(0);
2697: }
2699: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2700: {
2701: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2702: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5;
2703: const PetscScalar *xb;
2704: PetscScalar x1,x2,x3,x4,x5;
2705: const MatScalar *v,*vv;
2706: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2707: PetscBool usecprow=a->compressedrow.use;
2711: idx = a->j;
2712: v = a->a;
2713: if (usecprow) {
2714: mbs = a->compressedrow.nrows;
2715: ii = a->compressedrow.i;
2716: ridx = a->compressedrow.rindex;
2717: } else {
2718: mbs = a->mbs;
2719: ii = a->i;
2720: z = c;
2721: }
2723: for (i=0; i<mbs; i++) {
2724: n = ii[1] - ii[0]; ii++;
2725: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2726: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2727: if (usecprow) z = c + 5*ridx[i];
2728: jj = idx;
2729: vv = v;
2730: for (k=0; k<cn; k++) {
2731: idx = jj;
2732: v = vv;
2733: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2734: for (j=0; j<n; j++) {
2735: xb = b + 5*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm]; x5 = xb[4+k*cm];
2736: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2737: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2738: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2739: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2740: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2741: v += 25;
2742: }
2743: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2744: }
2745: if (!usecprow) z += 5;
2746: }
2747: return(0);
2748: }
2750: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2751: {
2752: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2753: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
2754: Mat_SeqDense *cd = (Mat_SeqDense*)B->data;
2755: PetscErrorCode ierr;
2756: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2757: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2758: PetscBLASInt bbs,bcn,bbm,bcm;
2759: PetscScalar *z = 0;
2760: PetscScalar *c,*b;
2761: const MatScalar *v;
2762: const PetscInt *idx,*ii,*ridx=NULL;
2763: PetscScalar _DZero=0.0,_DOne=1.0;
2764: PetscBool usecprow=a->compressedrow.use;
2767: if (!cm || !cn) return(0);
2768: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2769: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2770: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2771: b = bd->v;
2772: if (a->nonzerorowcnt != A->rmap->n) {
2773: MatZeroEntries(C);
2774: }
2775: MatDenseGetArray(C,&c);
2776: switch (bs) {
2777: case 1:
2778: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2779: break;
2780: case 2:
2781: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2782: break;
2783: case 3:
2784: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2785: break;
2786: case 4:
2787: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2788: break;
2789: case 5:
2790: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2791: break;
2792: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2793: PetscBLASIntCast(bs,&bbs);
2794: PetscBLASIntCast(cn,&bcn);
2795: PetscBLASIntCast(bm,&bbm);
2796: PetscBLASIntCast(cm,&bcm);
2797: idx = a->j;
2798: v = a->a;
2799: if (usecprow) {
2800: mbs = a->compressedrow.nrows;
2801: ii = a->compressedrow.i;
2802: ridx = a->compressedrow.rindex;
2803: } else {
2804: mbs = a->mbs;
2805: ii = a->i;
2806: z = c;
2807: }
2808: for (i=0; i<mbs; i++) {
2809: n = ii[1] - ii[0]; ii++;
2810: if (usecprow) z = c + bs*ridx[i];
2811: if (n) {
2812: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2813: v += bs2;
2814: }
2815: for (j=1; j<n; j++) {
2816: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2817: v += bs2;
2818: }
2819: if (!usecprow) z += bs;
2820: }
2821: }
2822: MatDenseRestoreArray(C,&c);
2823: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2824: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2825: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
2826: return(0);
2827: }