Actual source code: baij2.c
petsc-3.13.6 2020-09-29
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9: #include <immintrin.h>
10: #endif
12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17: const PetscInt *idx;
18: PetscInt start,end,*ai,*aj,bs,*nidx2;
19: PetscBT table;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: PetscBTCreate(m,&table);
30: PetscMalloc1(m+1,&nidx);
31: PetscMalloc1(A->rmap->N+1,&nidx2);
33: for (i=0; i<is_max; i++) {
34: /* Initialise the two local arrays */
35: isz = 0;
36: PetscBTMemzero(m,table);
38: /* Extract the indices, assume there can be duplicate entries */
39: ISGetIndices(is[i],&idx);
40: ISGetLocalSize(is[i],&n);
42: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43: for (j=0; j<n; ++j) {
44: ival = idx[j]/bs; /* convert the indices into block indices */
45: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(&is[i]);
51: k = 0;
52: for (j=0; j<ov; j++) { /* for each overlap*/
53: n = isz;
54: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55: row = nidx[k];
56: start = ai[row];
57: end = ai[row+1];
58: for (l = start; l<end; l++) {
59: val = aj[l];
60: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61: }
62: }
63: }
64: /* expand the Index Set */
65: for (j=0; j<isz; j++) {
66: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67: }
68: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
69: }
70: PetscBTDestroy(&table);
71: PetscFree(nidx);
72: PetscFree(nidx2);
73: return(0);
74: }
76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: const PetscInt *irow,*icol;
83: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84: PetscInt *aj = a->j,*ai = a->i;
85: MatScalar *mat_a;
86: Mat C;
87: PetscBool flag;
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscCalloc1(1+oldcols,&smap);
96: ssmap = smap;
97: PetscMalloc1(1+nrows,&lens);
98: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99: /* determine lens of each row */
100: for (i=0; i<nrows; i++) {
101: kstart = ai[irow[i]];
102: kend = kstart + a->ilen[irow[i]];
103: lens[i] = 0;
104: for (k=kstart; k<kend; k++) {
105: if (ssmap[aj[k]]) lens[i]++;
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ*)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115: PetscArrayzero(c->ilen,c->mbs);
116: C = *B;
117: } else {
118: MatCreate(PetscObjectComm((PetscObject)A),&C);
119: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120: MatSetType(C,((PetscObject)A)->type_name);
121: MatSeqBAIJSetPreallocation(C,bs,0,lens);
122: }
123: c = (Mat_SeqBAIJ*)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141: /* sort */
142: {
143: MatScalar *work;
144: PetscMalloc1(bs2,&work);
145: for (i=0; i<nrows; i++) {
146: PetscInt ilen;
147: mat_i = c->i[i];
148: mat_j = c->j + mat_i;
149: mat_a = c->a + mat_i*bs2;
150: ilen = c->ilen[i];
151: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152: }
153: PetscFree(work);
154: }
156: /* Free work space */
157: ISRestoreIndices(iscol,&icol);
158: PetscFree(smap);
159: PetscFree(lens);
160: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
163: ISRestoreIndices(isrow,&irow);
164: *B = C;
165: return(0);
166: }
168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171: IS is1,is2;
173: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174: const PetscInt *irow,*icol;
177: ISGetIndices(isrow,&irow);
178: ISGetIndices(iscol,&icol);
179: ISGetLocalSize(isrow,&nrows);
180: ISGetLocalSize(iscol,&ncols);
182: /* Verify if the indices corespond to each element in a block
183: and form the IS with compressed IS */
184: maxmnbs = PetscMax(a->mbs,a->nbs);
185: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186: PetscArrayzero(vary,a->mbs);
187: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190: }
191: count = 0;
192: for (i=0; i<nrows; i++) {
193: j = irow[i] / bs;
194: if ((vary[j]--)==bs) iary[count++] = j;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
198: PetscArrayzero(vary,a->nbs);
199: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200: for (i=0; i<a->nbs; i++) {
201: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202: }
203: count = 0;
204: for (i=0; i<ncols; i++) {
205: j = icol[i] / bs;
206: if ((vary[j]--)==bs) iary[count++] = j;
207: }
208: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209: ISRestoreIndices(isrow,&irow);
210: ISRestoreIndices(iscol,&icol);
211: PetscFree2(vary,iary);
213: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214: ISDestroy(&is1);
215: ISDestroy(&is2);
216: return(0);
217: }
219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
223: Mat_SubSppt *submatj = c->submatis1;
226: (*submatj->destroy)(C);
227: MatDestroySubMatrix_Private(submatj);
228: return(0);
229: }
231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234: PetscInt i;
235: Mat C;
236: Mat_SeqBAIJ *c;
237: Mat_SubSppt *submatj;
240: for (i=0; i<n; i++) {
241: C = (*mat)[i];
242: c = (Mat_SeqBAIJ*)C->data;
243: submatj = c->submatis1;
244: if (submatj) {
245: (*submatj->destroy)(C);
246: MatDestroySubMatrix_Private(submatj);
247: PetscFree(C->defaultvectype);
248: PetscLayoutDestroy(&C->rmap);
249: PetscLayoutDestroy(&C->cmap);
250: PetscHeaderDestroy(&C);
251: } else {
252: MatDestroy(&C);
253: }
254: }
255: PetscFree(*mat);
256: return(0);
257: }
259: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
262: PetscInt i;
265: if (scall == MAT_INITIAL_MATRIX) {
266: PetscCalloc1(n+1,B);
267: }
269: for (i=0; i<n; i++) {
270: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
271: }
272: return(0);
273: }
275: /* -------------------------------------------------------*/
276: /* Should check that shapes of vectors and matrices match */
277: /* -------------------------------------------------------*/
279: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280: {
281: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
282: PetscScalar *z,sum;
283: const PetscScalar *x;
284: const MatScalar *v;
285: PetscErrorCode ierr;
286: PetscInt mbs,i,n;
287: const PetscInt *idx,*ii,*ridx=NULL;
288: PetscBool usecprow=a->compressedrow.use;
291: VecGetArrayRead(xx,&x);
292: VecGetArray(zz,&z);
294: if (usecprow) {
295: mbs = a->compressedrow.nrows;
296: ii = a->compressedrow.i;
297: ridx = a->compressedrow.rindex;
298: PetscArrayzero(z,a->mbs);
299: } else {
300: mbs = a->mbs;
301: ii = a->i;
302: }
304: for (i=0; i<mbs; i++) {
305: n = ii[1] - ii[0];
306: v = a->a + ii[0];
307: idx = a->j + ii[0];
308: ii++;
309: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
310: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311: sum = 0.0;
312: PetscSparseDensePlusDot(sum,x,v,idx,n);
313: if (usecprow) {
314: z[ridx[i]] = sum;
315: } else {
316: z[i] = sum;
317: }
318: }
319: VecRestoreArrayRead(xx,&x);
320: VecRestoreArray(zz,&z);
321: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
322: return(0);
323: }
325: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326: {
327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
328: PetscScalar *z = 0,sum1,sum2,*zarray;
329: const PetscScalar *x,*xb;
330: PetscScalar x1,x2;
331: const MatScalar *v;
332: PetscErrorCode ierr;
333: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
334: PetscBool usecprow=a->compressedrow.use;
337: VecGetArrayRead(xx,&x);
338: VecGetArray(zz,&zarray);
340: idx = a->j;
341: v = a->a;
342: if (usecprow) {
343: mbs = a->compressedrow.nrows;
344: ii = a->compressedrow.i;
345: ridx = a->compressedrow.rindex;
346: PetscArrayzero(zarray,2*a->mbs);
347: } else {
348: mbs = a->mbs;
349: ii = a->i;
350: z = zarray;
351: }
353: for (i=0; i<mbs; i++) {
354: n = ii[1] - ii[0]; ii++;
355: sum1 = 0.0; sum2 = 0.0;
356: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358: for (j=0; j<n; j++) {
359: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360: sum1 += v[0]*x1 + v[2]*x2;
361: sum2 += v[1]*x1 + v[3]*x2;
362: v += 4;
363: }
364: if (usecprow) z = zarray + 2*ridx[i];
365: z[0] = sum1; z[1] = sum2;
366: if (!usecprow) z += 2;
367: }
368: VecRestoreArrayRead(xx,&x);
369: VecRestoreArray(zz,&zarray);
370: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
371: return(0);
372: }
374: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375: {
376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
378: const PetscScalar *x,*xb;
379: const MatScalar *v;
380: PetscErrorCode ierr;
381: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
382: PetscBool usecprow=a->compressedrow.use;
384: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385: #pragma disjoint(*v,*z,*xb)
386: #endif
389: VecGetArrayRead(xx,&x);
390: VecGetArray(zz,&zarray);
392: idx = a->j;
393: v = a->a;
394: if (usecprow) {
395: mbs = a->compressedrow.nrows;
396: ii = a->compressedrow.i;
397: ridx = a->compressedrow.rindex;
398: PetscArrayzero(zarray,3*a->mbs);
399: } else {
400: mbs = a->mbs;
401: ii = a->i;
402: z = zarray;
403: }
405: for (i=0; i<mbs; i++) {
406: n = ii[1] - ii[0]; ii++;
407: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
408: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
409: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410: for (j=0; j<n; j++) {
411: xb = x + 3*(*idx++);
412: x1 = xb[0];
413: x2 = xb[1];
414: x3 = xb[2];
416: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419: v += 9;
420: }
421: if (usecprow) z = zarray + 3*ridx[i];
422: z[0] = sum1; z[1] = sum2; z[2] = sum3;
423: if (!usecprow) z += 3;
424: }
425: VecRestoreArrayRead(xx,&x);
426: VecRestoreArray(zz,&zarray);
427: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
428: return(0);
429: }
431: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432: {
433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
434: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435: const PetscScalar *x,*xb;
436: const MatScalar *v;
437: PetscErrorCode ierr;
438: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
439: PetscBool usecprow=a->compressedrow.use;
442: VecGetArrayRead(xx,&x);
443: VecGetArray(zz,&zarray);
445: idx = a->j;
446: v = a->a;
447: if (usecprow) {
448: mbs = a->compressedrow.nrows;
449: ii = a->compressedrow.i;
450: ridx = a->compressedrow.rindex;
451: PetscArrayzero(zarray,4*a->mbs);
452: } else {
453: mbs = a->mbs;
454: ii = a->i;
455: z = zarray;
456: }
458: for (i=0; i<mbs; i++) {
459: n = ii[1] - ii[0];
460: ii++;
461: sum1 = 0.0;
462: sum2 = 0.0;
463: sum3 = 0.0;
464: sum4 = 0.0;
466: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
467: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468: for (j=0; j<n; j++) {
469: xb = x + 4*(*idx++);
470: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
472: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
473: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475: v += 16;
476: }
477: if (usecprow) z = zarray + 4*ridx[i];
478: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479: if (!usecprow) z += 4;
480: }
481: VecRestoreArrayRead(xx,&x);
482: VecRestoreArray(zz,&zarray);
483: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
484: return(0);
485: }
487: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488: {
489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
490: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
491: const PetscScalar *xb,*x;
492: const MatScalar *v;
493: PetscErrorCode ierr;
494: const PetscInt *idx,*ii,*ridx=NULL;
495: PetscInt mbs,i,j,n;
496: PetscBool usecprow=a->compressedrow.use;
499: VecGetArrayRead(xx,&x);
500: VecGetArray(zz,&zarray);
502: idx = a->j;
503: v = a->a;
504: if (usecprow) {
505: mbs = a->compressedrow.nrows;
506: ii = a->compressedrow.i;
507: ridx = a->compressedrow.rindex;
508: PetscArrayzero(zarray,5*a->mbs);
509: } else {
510: mbs = a->mbs;
511: ii = a->i;
512: z = zarray;
513: }
515: for (i=0; i<mbs; i++) {
516: n = ii[1] - ii[0]; ii++;
517: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520: for (j=0; j<n; j++) {
521: xb = x + 5*(*idx++);
522: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528: v += 25;
529: }
530: if (usecprow) z = zarray + 5*ridx[i];
531: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532: if (!usecprow) z += 5;
533: }
534: VecRestoreArrayRead(xx,&x);
535: VecRestoreArray(zz,&zarray);
536: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
537: return(0);
538: }
540: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541: {
542: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
543: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
544: const PetscScalar *x,*xb;
545: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
546: const MatScalar *v;
547: PetscErrorCode ierr;
548: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
549: PetscBool usecprow=a->compressedrow.use;
552: VecGetArrayRead(xx,&x);
553: VecGetArray(zz,&zarray);
555: idx = a->j;
556: v = a->a;
557: if (usecprow) {
558: mbs = a->compressedrow.nrows;
559: ii = a->compressedrow.i;
560: ridx = a->compressedrow.rindex;
561: PetscArrayzero(zarray,6*a->mbs);
562: } else {
563: mbs = a->mbs;
564: ii = a->i;
565: z = zarray;
566: }
568: for (i=0; i<mbs; i++) {
569: n = ii[1] - ii[0];
570: ii++;
571: sum1 = 0.0;
572: sum2 = 0.0;
573: sum3 = 0.0;
574: sum4 = 0.0;
575: sum5 = 0.0;
576: sum6 = 0.0;
578: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580: for (j=0; j<n; j++) {
581: xb = x + 6*(*idx++);
582: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589: v += 36;
590: }
591: if (usecprow) z = zarray + 6*ridx[i];
592: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593: if (!usecprow) z += 6;
594: }
596: VecRestoreArrayRead(xx,&x);
597: VecRestoreArray(zz,&zarray);
598: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
599: return(0);
600: }
602: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603: {
604: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
605: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606: const PetscScalar *x,*xb;
607: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
608: const MatScalar *v;
609: PetscErrorCode ierr;
610: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
611: PetscBool usecprow=a->compressedrow.use;
614: VecGetArrayRead(xx,&x);
615: VecGetArray(zz,&zarray);
617: idx = a->j;
618: v = a->a;
619: if (usecprow) {
620: mbs = a->compressedrow.nrows;
621: ii = a->compressedrow.i;
622: ridx = a->compressedrow.rindex;
623: PetscArrayzero(zarray,7*a->mbs);
624: } else {
625: mbs = a->mbs;
626: ii = a->i;
627: z = zarray;
628: }
630: for (i=0; i<mbs; i++) {
631: n = ii[1] - ii[0];
632: ii++;
633: sum1 = 0.0;
634: sum2 = 0.0;
635: sum3 = 0.0;
636: sum4 = 0.0;
637: sum5 = 0.0;
638: sum6 = 0.0;
639: sum7 = 0.0;
641: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
642: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643: for (j=0; j<n; j++) {
644: xb = x + 7*(*idx++);
645: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653: v += 49;
654: }
655: if (usecprow) z = zarray + 7*ridx[i];
656: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657: if (!usecprow) z += 7;
658: }
660: VecRestoreArrayRead(xx,&x);
661: VecRestoreArray(zz,&zarray);
662: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
663: return(0);
664: }
666: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668: {
669: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
670: PetscScalar *z = 0,*work,*workt,*zarray;
671: const PetscScalar *x,*xb;
672: const MatScalar *v;
674: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675: const PetscInt *idx,*ii,*ridx=NULL;
676: PetscInt k;
677: PetscBool usecprow=a->compressedrow.use;
679: __m256d a0,a1,a2,a3,a4,a5;
680: __m256d w0,w1,w2,w3;
681: __m256d z0,z1,z2;
682: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
685: VecGetArrayRead(xx,&x);
686: VecGetArray(zz,&zarray);
688: idx = a->j;
689: v = a->a;
690: if (usecprow) {
691: mbs = a->compressedrow.nrows;
692: ii = a->compressedrow.i;
693: ridx = a->compressedrow.rindex;
694: PetscArrayzero(zarray,bs*a->mbs);
695: } else {
696: mbs = a->mbs;
697: ii = a->i;
698: z = zarray;
699: }
701: if (!a->mult_work) {
702: k = PetscMax(A->rmap->n,A->cmap->n);
703: PetscMalloc1(k+1,&a->mult_work);
704: }
706: work = a->mult_work;
707: for (i=0; i<mbs; i++) {
708: n = ii[1] - ii[0]; ii++;
709: workt = work;
710: for (j=0; j<n; j++) {
711: xb = x + bs*(*idx++);
712: for (k=0; k<bs; k++) workt[k] = xb[k];
713: workt += bs;
714: }
715: if (usecprow) z = zarray + bs*ridx[i];
717: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
719: for (j=0; j<n; j++) {
720: /* first column of a */
721: w0 = _mm256_set1_pd(work[j*9 ]);
722: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
726: /* second column of a */
727: w1 = _mm256_set1_pd(work[j*9+ 1]);
728: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
732: /* third column of a */
733: w2 = _mm256_set1_pd(work[j*9 +2]);
734: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
738: /* fourth column of a */
739: w3 = _mm256_set1_pd(work[j*9+ 3]);
740: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
744: /* fifth column of a */
745: w0 = _mm256_set1_pd(work[j*9+ 4]);
746: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
750: /* sixth column of a */
751: w1 = _mm256_set1_pd(work[j*9+ 5]);
752: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
756: /* seventh column of a */
757: w2 = _mm256_set1_pd(work[j*9+ 6]);
758: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
762: /* eigth column of a */
763: w3 = _mm256_set1_pd(work[j*9+ 7]);
764: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
768: /* ninth column of a */
769: w0 = _mm256_set1_pd(work[j*9+ 8]);
770: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773: }
775: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
777: v += n*bs2;
778: if (!usecprow) z += bs;
779: }
780: VecRestoreArrayRead(xx,&x);
781: VecRestoreArray(zz,&zarray);
782: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
783: return(0);
784: }
785: #endif
787: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788: {
789: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
790: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791: const PetscScalar *x,*xb;
792: PetscScalar *zarray,xv;
793: const MatScalar *v;
794: PetscErrorCode ierr;
795: const PetscInt *ii,*ij=a->j,*idx;
796: PetscInt mbs,i,j,k,n,*ridx=NULL;
797: PetscBool usecprow=a->compressedrow.use;
800: VecGetArrayRead(xx,&x);
801: VecGetArray(zz,&zarray);
803: v = a->a;
804: if (usecprow) {
805: mbs = a->compressedrow.nrows;
806: ii = a->compressedrow.i;
807: ridx = a->compressedrow.rindex;
808: PetscArrayzero(zarray,11*a->mbs);
809: } else {
810: mbs = a->mbs;
811: ii = a->i;
812: z = zarray;
813: }
815: for (i=0; i<mbs; i++) {
816: n = ii[i+1] - ii[i];
817: idx = ij + ii[i];
818: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
821: for (j=0; j<n; j++) {
822: xb = x + 11*(idx[j]);
824: for (k=0; k<11; k++) {
825: xv = xb[k];
826: sum1 += v[0]*xv;
827: sum2 += v[1]*xv;
828: sum3 += v[2]*xv;
829: sum4 += v[3]*xv;
830: sum5 += v[4]*xv;
831: sum6 += v[5]*xv;
832: sum7 += v[6]*xv;
833: sum8 += v[7]*xv;
834: sum9 += v[8]*xv;
835: sum10 += v[9]*xv;
836: sum11 += v[10]*xv;
837: v += 11;
838: }
839: }
840: if (usecprow) z = zarray + 11*ridx[i];
841: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
844: if (!usecprow) z += 11;
845: }
847: VecRestoreArrayRead(xx,&x);
848: VecRestoreArray(zz,&zarray);
849: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
850: return(0);
851: }
853: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854: /* Default MatMult for block size 15 */
855: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
856: {
857: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
858: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
859: const PetscScalar *x,*xb;
860: PetscScalar *zarray,xv;
861: const MatScalar *v;
862: PetscErrorCode ierr;
863: const PetscInt *ii,*ij=a->j,*idx;
864: PetscInt mbs,i,j,k,n,*ridx=NULL;
865: PetscBool usecprow=a->compressedrow.use;
868: VecGetArrayRead(xx,&x);
869: VecGetArray(zz,&zarray);
871: v = a->a;
872: if (usecprow) {
873: mbs = a->compressedrow.nrows;
874: ii = a->compressedrow.i;
875: ridx = a->compressedrow.rindex;
876: PetscArrayzero(zarray,15*a->mbs);
877: } else {
878: mbs = a->mbs;
879: ii = a->i;
880: z = zarray;
881: }
883: for (i=0; i<mbs; i++) {
884: n = ii[i+1] - ii[i];
885: idx = ij + ii[i];
886: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
887: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
889: for (j=0; j<n; j++) {
890: xb = x + 15*(idx[j]);
892: for (k=0; k<15; k++) {
893: xv = xb[k];
894: sum1 += v[0]*xv;
895: sum2 += v[1]*xv;
896: sum3 += v[2]*xv;
897: sum4 += v[3]*xv;
898: sum5 += v[4]*xv;
899: sum6 += v[5]*xv;
900: sum7 += v[6]*xv;
901: sum8 += v[7]*xv;
902: sum9 += v[8]*xv;
903: sum10 += v[9]*xv;
904: sum11 += v[10]*xv;
905: sum12 += v[11]*xv;
906: sum13 += v[12]*xv;
907: sum14 += v[13]*xv;
908: sum15 += v[14]*xv;
909: v += 15;
910: }
911: }
912: if (usecprow) z = zarray + 15*ridx[i];
913: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
914: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
916: if (!usecprow) z += 15;
917: }
919: VecRestoreArrayRead(xx,&x);
920: VecRestoreArray(zz,&zarray);
921: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
922: return(0);
923: }
925: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
926: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
927: {
928: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
929: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
930: const PetscScalar *x,*xb;
931: PetscScalar x1,x2,x3,x4,*zarray;
932: const MatScalar *v;
933: PetscErrorCode ierr;
934: const PetscInt *ii,*ij=a->j,*idx;
935: PetscInt mbs,i,j,n,*ridx=NULL;
936: PetscBool usecprow=a->compressedrow.use;
939: VecGetArrayRead(xx,&x);
940: VecGetArray(zz,&zarray);
942: v = a->a;
943: if (usecprow) {
944: mbs = a->compressedrow.nrows;
945: ii = a->compressedrow.i;
946: ridx = a->compressedrow.rindex;
947: PetscArrayzero(zarray,15*a->mbs);
948: } else {
949: mbs = a->mbs;
950: ii = a->i;
951: z = zarray;
952: }
954: for (i=0; i<mbs; i++) {
955: n = ii[i+1] - ii[i];
956: idx = ij + ii[i];
957: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
960: for (j=0; j<n; j++) {
961: xb = x + 15*(idx[j]);
962: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
964: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
965: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
966: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
967: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
968: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
969: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
970: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
971: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
972: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
973: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
974: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
975: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
976: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
977: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
978: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
980: v += 60;
982: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
984: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
985: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
986: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
987: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
988: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
989: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
990: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
991: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
992: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
993: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
994: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
995: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
996: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
997: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
998: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
999: v += 60;
1001: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1002: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1003: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1004: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1005: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1006: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1007: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1008: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1009: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1010: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1011: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1012: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1013: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1014: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1015: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1016: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1017: v += 60;
1019: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1020: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1021: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1022: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1023: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1024: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1025: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1026: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1027: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1028: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1029: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1030: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1031: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1032: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1033: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1034: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1035: v += 45;
1036: }
1037: if (usecprow) z = zarray + 15*ridx[i];
1038: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1039: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1041: if (!usecprow) z += 15;
1042: }
1044: VecRestoreArrayRead(xx,&x);
1045: VecRestoreArray(zz,&zarray);
1046: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1047: return(0);
1048: }
1050: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1051: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1052: {
1053: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1054: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1055: const PetscScalar *x,*xb;
1056: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1057: const MatScalar *v;
1058: PetscErrorCode ierr;
1059: const PetscInt *ii,*ij=a->j,*idx;
1060: PetscInt mbs,i,j,n,*ridx=NULL;
1061: PetscBool usecprow=a->compressedrow.use;
1064: VecGetArrayRead(xx,&x);
1065: VecGetArray(zz,&zarray);
1067: v = a->a;
1068: if (usecprow) {
1069: mbs = a->compressedrow.nrows;
1070: ii = a->compressedrow.i;
1071: ridx = a->compressedrow.rindex;
1072: PetscArrayzero(zarray,15*a->mbs);
1073: } else {
1074: mbs = a->mbs;
1075: ii = a->i;
1076: z = zarray;
1077: }
1079: for (i=0; i<mbs; i++) {
1080: n = ii[i+1] - ii[i];
1081: idx = ij + ii[i];
1082: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1083: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1085: for (j=0; j<n; j++) {
1086: xb = x + 15*(idx[j]);
1087: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1088: x8 = xb[7];
1090: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1091: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1092: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1093: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1094: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1095: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1096: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1097: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1098: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1099: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1100: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1101: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1102: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1103: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1104: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1105: v += 120;
1107: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1109: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1110: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1111: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1112: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1113: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1114: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1115: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1116: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1117: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1118: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1119: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1120: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1121: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1122: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1123: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1124: v += 105;
1125: }
1126: if (usecprow) z = zarray + 15*ridx[i];
1127: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1128: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1130: if (!usecprow) z += 15;
1131: }
1133: VecRestoreArrayRead(xx,&x);
1134: VecRestoreArray(zz,&zarray);
1135: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1136: return(0);
1137: }
1139: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1140: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1141: {
1142: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1143: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1144: const PetscScalar *x,*xb;
1145: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1146: const MatScalar *v;
1147: PetscErrorCode ierr;
1148: const PetscInt *ii,*ij=a->j,*idx;
1149: PetscInt mbs,i,j,n,*ridx=NULL;
1150: PetscBool usecprow=a->compressedrow.use;
1153: VecGetArrayRead(xx,&x);
1154: VecGetArray(zz,&zarray);
1156: v = a->a;
1157: if (usecprow) {
1158: mbs = a->compressedrow.nrows;
1159: ii = a->compressedrow.i;
1160: ridx = a->compressedrow.rindex;
1161: PetscArrayzero(zarray,15*a->mbs);
1162: } else {
1163: mbs = a->mbs;
1164: ii = a->i;
1165: z = zarray;
1166: }
1168: for (i=0; i<mbs; i++) {
1169: n = ii[i+1] - ii[i];
1170: idx = ij + ii[i];
1171: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1172: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1174: for (j=0; j<n; j++) {
1175: xb = x + 15*(idx[j]);
1176: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1177: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1179: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1180: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1181: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1182: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1183: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1184: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1185: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1186: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1187: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1188: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1189: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1190: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1191: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1192: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1193: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1194: v += 225;
1195: }
1196: if (usecprow) z = zarray + 15*ridx[i];
1197: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1198: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1200: if (!usecprow) z += 15;
1201: }
1203: VecRestoreArrayRead(xx,&x);
1204: VecRestoreArray(zz,&zarray);
1205: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1206: return(0);
1207: }
1209: /*
1210: This will not work with MatScalar == float because it calls the BLAS
1211: */
1212: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1213: {
1214: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1215: PetscScalar *z = 0,*work,*workt,*zarray;
1216: const PetscScalar *x,*xb;
1217: const MatScalar *v;
1218: PetscErrorCode ierr;
1219: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1220: const PetscInt *idx,*ii,*ridx=NULL;
1221: PetscInt ncols,k;
1222: PetscBool usecprow=a->compressedrow.use;
1225: VecGetArrayRead(xx,&x);
1226: VecGetArray(zz,&zarray);
1228: idx = a->j;
1229: v = a->a;
1230: if (usecprow) {
1231: mbs = a->compressedrow.nrows;
1232: ii = a->compressedrow.i;
1233: ridx = a->compressedrow.rindex;
1234: PetscArrayzero(zarray,bs*a->mbs);
1235: } else {
1236: mbs = a->mbs;
1237: ii = a->i;
1238: z = zarray;
1239: }
1241: if (!a->mult_work) {
1242: k = PetscMax(A->rmap->n,A->cmap->n);
1243: PetscMalloc1(k+1,&a->mult_work);
1244: }
1245: work = a->mult_work;
1246: for (i=0; i<mbs; i++) {
1247: n = ii[1] - ii[0]; ii++;
1248: ncols = n*bs;
1249: workt = work;
1250: for (j=0; j<n; j++) {
1251: xb = x + bs*(*idx++);
1252: for (k=0; k<bs; k++) workt[k] = xb[k];
1253: workt += bs;
1254: }
1255: if (usecprow) z = zarray + bs*ridx[i];
1256: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1257: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1258: v += n*bs2;
1259: if (!usecprow) z += bs;
1260: }
1261: VecRestoreArrayRead(xx,&x);
1262: VecRestoreArray(zz,&zarray);
1263: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1264: return(0);
1265: }
1267: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1268: {
1269: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1270: const PetscScalar *x;
1271: PetscScalar *y,*z,sum;
1272: const MatScalar *v;
1273: PetscErrorCode ierr;
1274: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1275: const PetscInt *idx,*ii;
1276: PetscBool usecprow=a->compressedrow.use;
1279: VecGetArrayRead(xx,&x);
1280: VecGetArrayPair(yy,zz,&y,&z);
1282: idx = a->j;
1283: v = a->a;
1284: if (usecprow) {
1285: if (zz != yy) {
1286: PetscArraycpy(z,y,mbs);
1287: }
1288: mbs = a->compressedrow.nrows;
1289: ii = a->compressedrow.i;
1290: ridx = a->compressedrow.rindex;
1291: } else {
1292: ii = a->i;
1293: }
1295: for (i=0; i<mbs; i++) {
1296: n = ii[1] - ii[0];
1297: ii++;
1298: if (!usecprow) {
1299: sum = y[i];
1300: } else {
1301: sum = y[ridx[i]];
1302: }
1303: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1304: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1305: PetscSparseDensePlusDot(sum,x,v,idx,n);
1306: v += n;
1307: idx += n;
1308: if (usecprow) {
1309: z[ridx[i]] = sum;
1310: } else {
1311: z[i] = sum;
1312: }
1313: }
1314: VecRestoreArrayRead(xx,&x);
1315: VecRestoreArrayPair(yy,zz,&y,&z);
1316: PetscLogFlops(2.0*a->nz);
1317: return(0);
1318: }
1320: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1321: {
1322: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1323: PetscScalar *y = 0,*z = 0,sum1,sum2;
1324: const PetscScalar *x,*xb;
1325: PetscScalar x1,x2,*yarray,*zarray;
1326: const MatScalar *v;
1327: PetscErrorCode ierr;
1328: PetscInt mbs = a->mbs,i,n,j;
1329: const PetscInt *idx,*ii,*ridx = NULL;
1330: PetscBool usecprow = a->compressedrow.use;
1333: VecGetArrayRead(xx,&x);
1334: VecGetArrayPair(yy,zz,&yarray,&zarray);
1336: idx = a->j;
1337: v = a->a;
1338: if (usecprow) {
1339: if (zz != yy) {
1340: PetscArraycpy(zarray,yarray,2*mbs);
1341: }
1342: mbs = a->compressedrow.nrows;
1343: ii = a->compressedrow.i;
1344: ridx = a->compressedrow.rindex;
1345: } else {
1346: ii = a->i;
1347: y = yarray;
1348: z = zarray;
1349: }
1351: for (i=0; i<mbs; i++) {
1352: n = ii[1] - ii[0]; ii++;
1353: if (usecprow) {
1354: z = zarray + 2*ridx[i];
1355: y = yarray + 2*ridx[i];
1356: }
1357: sum1 = y[0]; sum2 = y[1];
1358: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1359: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1360: for (j=0; j<n; j++) {
1361: xb = x + 2*(*idx++);
1362: x1 = xb[0];
1363: x2 = xb[1];
1365: sum1 += v[0]*x1 + v[2]*x2;
1366: sum2 += v[1]*x1 + v[3]*x2;
1367: v += 4;
1368: }
1369: z[0] = sum1; z[1] = sum2;
1370: if (!usecprow) {
1371: z += 2; y += 2;
1372: }
1373: }
1374: VecRestoreArrayRead(xx,&x);
1375: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1376: PetscLogFlops(4.0*a->nz);
1377: return(0);
1378: }
1380: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1381: {
1382: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1383: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1384: const PetscScalar *x,*xb;
1385: const MatScalar *v;
1386: PetscErrorCode ierr;
1387: PetscInt mbs = a->mbs,i,j,n;
1388: const PetscInt *idx,*ii,*ridx = NULL;
1389: PetscBool usecprow = a->compressedrow.use;
1392: VecGetArrayRead(xx,&x);
1393: VecGetArrayPair(yy,zz,&yarray,&zarray);
1395: idx = a->j;
1396: v = a->a;
1397: if (usecprow) {
1398: if (zz != yy) {
1399: PetscArraycpy(zarray,yarray,3*mbs);
1400: }
1401: mbs = a->compressedrow.nrows;
1402: ii = a->compressedrow.i;
1403: ridx = a->compressedrow.rindex;
1404: } else {
1405: ii = a->i;
1406: y = yarray;
1407: z = zarray;
1408: }
1410: for (i=0; i<mbs; i++) {
1411: n = ii[1] - ii[0]; ii++;
1412: if (usecprow) {
1413: z = zarray + 3*ridx[i];
1414: y = yarray + 3*ridx[i];
1415: }
1416: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1417: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1418: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1419: for (j=0; j<n; j++) {
1420: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1421: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1422: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1423: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1424: v += 9;
1425: }
1426: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1427: if (!usecprow) {
1428: z += 3; y += 3;
1429: }
1430: }
1431: VecRestoreArrayRead(xx,&x);
1432: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1433: PetscLogFlops(18.0*a->nz);
1434: return(0);
1435: }
1437: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1438: {
1439: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1440: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1441: const PetscScalar *x,*xb;
1442: const MatScalar *v;
1443: PetscErrorCode ierr;
1444: PetscInt mbs = a->mbs,i,j,n;
1445: const PetscInt *idx,*ii,*ridx=NULL;
1446: PetscBool usecprow=a->compressedrow.use;
1449: VecGetArrayRead(xx,&x);
1450: VecGetArrayPair(yy,zz,&yarray,&zarray);
1452: idx = a->j;
1453: v = a->a;
1454: if (usecprow) {
1455: if (zz != yy) {
1456: PetscArraycpy(zarray,yarray,4*mbs);
1457: }
1458: mbs = a->compressedrow.nrows;
1459: ii = a->compressedrow.i;
1460: ridx = a->compressedrow.rindex;
1461: } else {
1462: ii = a->i;
1463: y = yarray;
1464: z = zarray;
1465: }
1467: for (i=0; i<mbs; i++) {
1468: n = ii[1] - ii[0]; ii++;
1469: if (usecprow) {
1470: z = zarray + 4*ridx[i];
1471: y = yarray + 4*ridx[i];
1472: }
1473: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1474: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1475: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1476: for (j=0; j<n; j++) {
1477: xb = x + 4*(*idx++);
1478: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1479: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1480: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1481: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1482: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1483: v += 16;
1484: }
1485: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1486: if (!usecprow) {
1487: z += 4; y += 4;
1488: }
1489: }
1490: VecRestoreArrayRead(xx,&x);
1491: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1492: PetscLogFlops(32.0*a->nz);
1493: return(0);
1494: }
1496: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1497: {
1498: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1499: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1500: const PetscScalar *x,*xb;
1501: PetscScalar *yarray,*zarray;
1502: const MatScalar *v;
1503: PetscErrorCode ierr;
1504: PetscInt mbs = a->mbs,i,j,n;
1505: const PetscInt *idx,*ii,*ridx = NULL;
1506: PetscBool usecprow=a->compressedrow.use;
1509: VecGetArrayRead(xx,&x);
1510: VecGetArrayPair(yy,zz,&yarray,&zarray);
1512: idx = a->j;
1513: v = a->a;
1514: if (usecprow) {
1515: if (zz != yy) {
1516: PetscArraycpy(zarray,yarray,5*mbs);
1517: }
1518: mbs = a->compressedrow.nrows;
1519: ii = a->compressedrow.i;
1520: ridx = a->compressedrow.rindex;
1521: } else {
1522: ii = a->i;
1523: y = yarray;
1524: z = zarray;
1525: }
1527: for (i=0; i<mbs; i++) {
1528: n = ii[1] - ii[0]; ii++;
1529: if (usecprow) {
1530: z = zarray + 5*ridx[i];
1531: y = yarray + 5*ridx[i];
1532: }
1533: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1534: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1535: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536: for (j=0; j<n; j++) {
1537: xb = x + 5*(*idx++);
1538: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1539: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1540: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1541: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1542: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1543: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1544: v += 25;
1545: }
1546: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1547: if (!usecprow) {
1548: z += 5; y += 5;
1549: }
1550: }
1551: VecRestoreArrayRead(xx,&x);
1552: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1553: PetscLogFlops(50.0*a->nz);
1554: return(0);
1555: }
1557: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1558: {
1559: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1560: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1561: const PetscScalar *x,*xb;
1562: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1563: const MatScalar *v;
1564: PetscErrorCode ierr;
1565: PetscInt mbs = a->mbs,i,j,n;
1566: const PetscInt *idx,*ii,*ridx=NULL;
1567: PetscBool usecprow=a->compressedrow.use;
1570: VecGetArrayRead(xx,&x);
1571: VecGetArrayPair(yy,zz,&yarray,&zarray);
1573: idx = a->j;
1574: v = a->a;
1575: if (usecprow) {
1576: if (zz != yy) {
1577: PetscArraycpy(zarray,yarray,6*mbs);
1578: }
1579: mbs = a->compressedrow.nrows;
1580: ii = a->compressedrow.i;
1581: ridx = a->compressedrow.rindex;
1582: } else {
1583: ii = a->i;
1584: y = yarray;
1585: z = zarray;
1586: }
1588: for (i=0; i<mbs; i++) {
1589: n = ii[1] - ii[0]; ii++;
1590: if (usecprow) {
1591: z = zarray + 6*ridx[i];
1592: y = yarray + 6*ridx[i];
1593: }
1594: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1595: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1596: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597: for (j=0; j<n; j++) {
1598: xb = x + 6*(*idx++);
1599: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1600: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1601: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1602: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1603: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1604: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1605: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1606: v += 36;
1607: }
1608: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1609: if (!usecprow) {
1610: z += 6; y += 6;
1611: }
1612: }
1613: VecRestoreArrayRead(xx,&x);
1614: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1615: PetscLogFlops(72.0*a->nz);
1616: return(0);
1617: }
1619: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1620: {
1621: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1622: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1623: const PetscScalar *x,*xb;
1624: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1625: const MatScalar *v;
1626: PetscErrorCode ierr;
1627: PetscInt mbs = a->mbs,i,j,n;
1628: const PetscInt *idx,*ii,*ridx = NULL;
1629: PetscBool usecprow=a->compressedrow.use;
1632: VecGetArrayRead(xx,&x);
1633: VecGetArrayPair(yy,zz,&yarray,&zarray);
1635: idx = a->j;
1636: v = a->a;
1637: if (usecprow) {
1638: if (zz != yy) {
1639: PetscArraycpy(zarray,yarray,7*mbs);
1640: }
1641: mbs = a->compressedrow.nrows;
1642: ii = a->compressedrow.i;
1643: ridx = a->compressedrow.rindex;
1644: } else {
1645: ii = a->i;
1646: y = yarray;
1647: z = zarray;
1648: }
1650: for (i=0; i<mbs; i++) {
1651: n = ii[1] - ii[0]; ii++;
1652: if (usecprow) {
1653: z = zarray + 7*ridx[i];
1654: y = yarray + 7*ridx[i];
1655: }
1656: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1657: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1658: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1659: for (j=0; j<n; j++) {
1660: xb = x + 7*(*idx++);
1661: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1662: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1663: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1664: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1665: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1666: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1667: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1668: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1669: v += 49;
1670: }
1671: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1672: if (!usecprow) {
1673: z += 7; y += 7;
1674: }
1675: }
1676: VecRestoreArrayRead(xx,&x);
1677: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1678: PetscLogFlops(98.0*a->nz);
1679: return(0);
1680: }
1682: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1683: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1684: {
1685: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1686: PetscScalar *z = 0,*work,*workt,*zarray;
1687: const PetscScalar *x,*xb;
1688: const MatScalar *v;
1690: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691: PetscInt k;
1692: PetscBool usecprow=a->compressedrow.use;
1693: const PetscInt *idx,*ii,*ridx=NULL;
1695: __m256d a0,a1,a2,a3,a4,a5;
1696: __m256d w0,w1,w2,w3;
1697: __m256d z0,z1,z2;
1698: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1701: VecCopy(yy,zz);
1702: VecGetArrayRead(xx,&x);
1703: VecGetArray(zz,&zarray);
1705: idx = a->j;
1706: v = a->a;
1707: if (usecprow) {
1708: mbs = a->compressedrow.nrows;
1709: ii = a->compressedrow.i;
1710: ridx = a->compressedrow.rindex;
1711: } else {
1712: mbs = a->mbs;
1713: ii = a->i;
1714: z = zarray;
1715: }
1717: if (!a->mult_work) {
1718: k = PetscMax(A->rmap->n,A->cmap->n);
1719: PetscMalloc1(k+1,&a->mult_work);
1720: }
1722: work = a->mult_work;
1723: for (i=0; i<mbs; i++) {
1724: n = ii[1] - ii[0]; ii++;
1725: workt = work;
1726: for (j=0; j<n; j++) {
1727: xb = x + bs*(*idx++);
1728: for (k=0; k<bs; k++) workt[k] = xb[k];
1729: workt += bs;
1730: }
1731: if (usecprow) z = zarray + bs*ridx[i];
1733: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1735: for (j=0; j<n; j++) {
1736: /* first column of a */
1737: w0 = _mm256_set1_pd(work[j*9 ]);
1738: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1739: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1740: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1742: /* second column of a */
1743: w1 = _mm256_set1_pd(work[j*9+ 1]);
1744: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1745: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1746: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1748: /* third column of a */
1749: w2 = _mm256_set1_pd(work[j*9+ 2]);
1750: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1751: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1752: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1754: /* fourth column of a */
1755: w3 = _mm256_set1_pd(work[j*9+ 3]);
1756: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1757: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1758: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1760: /* fifth column of a */
1761: w0 = _mm256_set1_pd(work[j*9+ 4]);
1762: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1763: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1764: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1766: /* sixth column of a */
1767: w1 = _mm256_set1_pd(work[j*9+ 5]);
1768: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1769: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1770: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1772: /* seventh column of a */
1773: w2 = _mm256_set1_pd(work[j*9+ 6]);
1774: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1775: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1776: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1778: /* eigth column of a */
1779: w3 = _mm256_set1_pd(work[j*9+ 7]);
1780: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1781: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1782: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1784: /* ninth column of a */
1785: w0 = _mm256_set1_pd(work[j*9+ 8]);
1786: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1787: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1788: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1789: }
1791: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1793: v += n*bs2;
1794: if (!usecprow) z += bs;
1795: }
1796: VecRestoreArrayRead(xx,&x);
1797: VecRestoreArray(zz,&zarray);
1798: PetscLogFlops(162.0*a->nz);
1799: return(0);
1800: }
1801: #endif
1803: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1804: {
1805: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1806: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1807: const PetscScalar *x,*xb;
1808: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1809: const MatScalar *v;
1810: PetscErrorCode ierr;
1811: PetscInt mbs = a->mbs,i,j,n;
1812: const PetscInt *idx,*ii,*ridx = NULL;
1813: PetscBool usecprow=a->compressedrow.use;
1816: VecGetArrayRead(xx,&x);
1817: VecGetArrayPair(yy,zz,&yarray,&zarray);
1819: idx = a->j;
1820: v = a->a;
1821: if (usecprow) {
1822: if (zz != yy) {
1823: PetscArraycpy(zarray,yarray,7*mbs);
1824: }
1825: mbs = a->compressedrow.nrows;
1826: ii = a->compressedrow.i;
1827: ridx = a->compressedrow.rindex;
1828: } else {
1829: ii = a->i;
1830: y = yarray;
1831: z = zarray;
1832: }
1834: for (i=0; i<mbs; i++) {
1835: n = ii[1] - ii[0]; ii++;
1836: if (usecprow) {
1837: z = zarray + 11*ridx[i];
1838: y = yarray + 11*ridx[i];
1839: }
1840: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1841: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1842: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1843: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1844: for (j=0; j<n; j++) {
1845: xb = x + 11*(*idx++);
1846: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1847: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1848: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1849: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1850: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1851: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1852: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1853: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1854: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1855: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1856: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1857: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1858: v += 121;
1859: }
1860: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1861: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1862: if (!usecprow) {
1863: z += 11; y += 11;
1864: }
1865: }
1866: VecRestoreArrayRead(xx,&x);
1867: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1868: PetscLogFlops(242.0*a->nz);
1869: return(0);
1870: }
1872: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1873: {
1874: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1875: PetscScalar *z = 0,*work,*workt,*zarray;
1876: const PetscScalar *x,*xb;
1877: const MatScalar *v;
1878: PetscErrorCode ierr;
1879: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1880: PetscInt ncols,k;
1881: const PetscInt *ridx = NULL,*idx,*ii;
1882: PetscBool usecprow = a->compressedrow.use;
1885: VecCopy(yy,zz);
1886: VecGetArrayRead(xx,&x);
1887: VecGetArray(zz,&zarray);
1889: idx = a->j;
1890: v = a->a;
1891: if (usecprow) {
1892: mbs = a->compressedrow.nrows;
1893: ii = a->compressedrow.i;
1894: ridx = a->compressedrow.rindex;
1895: } else {
1896: mbs = a->mbs;
1897: ii = a->i;
1898: z = zarray;
1899: }
1901: if (!a->mult_work) {
1902: k = PetscMax(A->rmap->n,A->cmap->n);
1903: PetscMalloc1(k+1,&a->mult_work);
1904: }
1905: work = a->mult_work;
1906: for (i=0; i<mbs; i++) {
1907: n = ii[1] - ii[0]; ii++;
1908: ncols = n*bs;
1909: workt = work;
1910: for (j=0; j<n; j++) {
1911: xb = x + bs*(*idx++);
1912: for (k=0; k<bs; k++) workt[k] = xb[k];
1913: workt += bs;
1914: }
1915: if (usecprow) z = zarray + bs*ridx[i];
1916: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1917: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1918: v += n*bs2;
1919: if (!usecprow) z += bs;
1920: }
1921: VecRestoreArrayRead(xx,&x);
1922: VecRestoreArray(zz,&zarray);
1923: PetscLogFlops(2.0*a->nz*bs2);
1924: return(0);
1925: }
1927: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1928: {
1929: PetscScalar zero = 0.0;
1933: VecSet(zz,zero);
1934: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1935: return(0);
1936: }
1938: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1939: {
1940: PetscScalar zero = 0.0;
1944: VecSet(zz,zero);
1945: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1946: return(0);
1947: }
1949: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1950: {
1951: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1952: PetscScalar *z,x1,x2,x3,x4,x5;
1953: const PetscScalar *x,*xb = NULL;
1954: const MatScalar *v;
1955: PetscErrorCode ierr;
1956: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1957: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1958: Mat_CompressedRow cprow = a->compressedrow;
1959: PetscBool usecprow = cprow.use;
1962: if (yy != zz) { VecCopy(yy,zz); }
1963: VecGetArrayRead(xx,&x);
1964: VecGetArray(zz,&z);
1966: idx = a->j;
1967: v = a->a;
1968: if (usecprow) {
1969: mbs = cprow.nrows;
1970: ii = cprow.i;
1971: ridx = cprow.rindex;
1972: } else {
1973: mbs=a->mbs;
1974: ii = a->i;
1975: xb = x;
1976: }
1978: switch (bs) {
1979: case 1:
1980: for (i=0; i<mbs; i++) {
1981: if (usecprow) xb = x + ridx[i];
1982: x1 = xb[0];
1983: ib = idx + ii[0];
1984: n = ii[1] - ii[0]; ii++;
1985: for (j=0; j<n; j++) {
1986: rval = ib[j];
1987: z[rval] += PetscConj(*v) * x1;
1988: v++;
1989: }
1990: if (!usecprow) xb++;
1991: }
1992: break;
1993: case 2:
1994: for (i=0; i<mbs; i++) {
1995: if (usecprow) xb = x + 2*ridx[i];
1996: x1 = xb[0]; x2 = xb[1];
1997: ib = idx + ii[0];
1998: n = ii[1] - ii[0]; ii++;
1999: for (j=0; j<n; j++) {
2000: rval = ib[j]*2;
2001: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2002: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2003: v += 4;
2004: }
2005: if (!usecprow) xb += 2;
2006: }
2007: break;
2008: case 3:
2009: for (i=0; i<mbs; i++) {
2010: if (usecprow) xb = x + 3*ridx[i];
2011: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2012: ib = idx + ii[0];
2013: n = ii[1] - ii[0]; ii++;
2014: for (j=0; j<n; j++) {
2015: rval = ib[j]*3;
2016: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2017: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2018: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2019: v += 9;
2020: }
2021: if (!usecprow) xb += 3;
2022: }
2023: break;
2024: case 4:
2025: for (i=0; i<mbs; i++) {
2026: if (usecprow) xb = x + 4*ridx[i];
2027: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2028: ib = idx + ii[0];
2029: n = ii[1] - ii[0]; ii++;
2030: for (j=0; j<n; j++) {
2031: rval = ib[j]*4;
2032: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2033: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2034: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2035: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2036: v += 16;
2037: }
2038: if (!usecprow) xb += 4;
2039: }
2040: break;
2041: case 5:
2042: for (i=0; i<mbs; i++) {
2043: if (usecprow) xb = x + 5*ridx[i];
2044: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2045: x4 = xb[3]; x5 = xb[4];
2046: ib = idx + ii[0];
2047: n = ii[1] - ii[0]; ii++;
2048: for (j=0; j<n; j++) {
2049: rval = ib[j]*5;
2050: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2051: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2052: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2053: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2054: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2055: v += 25;
2056: }
2057: if (!usecprow) xb += 5;
2058: }
2059: break;
2060: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2061: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2062: #if 0
2063: {
2064: PetscInt ncols,k,bs2=a->bs2;
2065: PetscScalar *work,*workt,zb;
2066: const PetscScalar *xtmp;
2067: if (!a->mult_work) {
2068: k = PetscMax(A->rmap->n,A->cmap->n);
2069: PetscMalloc1(k+1,&a->mult_work);
2070: }
2071: work = a->mult_work;
2072: xtmp = x;
2073: for (i=0; i<mbs; i++) {
2074: n = ii[1] - ii[0]; ii++;
2075: ncols = n*bs;
2076: PetscArrayzero(work,ncols);
2077: if (usecprow) xtmp = x + bs*ridx[i];
2078: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2079: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2080: v += n*bs2;
2081: if (!usecprow) xtmp += bs;
2082: workt = work;
2083: for (j=0; j<n; j++) {
2084: zb = z + bs*(*idx++);
2085: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2086: workt += bs;
2087: }
2088: }
2089: }
2090: #endif
2091: }
2092: VecRestoreArrayRead(xx,&x);
2093: VecRestoreArray(zz,&z);
2094: PetscLogFlops(2.0*a->nz*a->bs2);
2095: return(0);
2096: }
2098: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2099: {
2100: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2101: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2102: const PetscScalar *x,*xb = 0;
2103: const MatScalar *v;
2104: PetscErrorCode ierr;
2105: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2106: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2107: Mat_CompressedRow cprow = a->compressedrow;
2108: PetscBool usecprow=cprow.use;
2111: if (yy != zz) { VecCopy(yy,zz); }
2112: VecGetArrayRead(xx,&x);
2113: VecGetArray(zz,&z);
2115: idx = a->j;
2116: v = a->a;
2117: if (usecprow) {
2118: mbs = cprow.nrows;
2119: ii = cprow.i;
2120: ridx = cprow.rindex;
2121: } else {
2122: mbs=a->mbs;
2123: ii = a->i;
2124: xb = x;
2125: }
2127: switch (bs) {
2128: case 1:
2129: for (i=0; i<mbs; i++) {
2130: if (usecprow) xb = x + ridx[i];
2131: x1 = xb[0];
2132: ib = idx + ii[0];
2133: n = ii[1] - ii[0]; ii++;
2134: for (j=0; j<n; j++) {
2135: rval = ib[j];
2136: z[rval] += *v * x1;
2137: v++;
2138: }
2139: if (!usecprow) xb++;
2140: }
2141: break;
2142: case 2:
2143: for (i=0; i<mbs; i++) {
2144: if (usecprow) xb = x + 2*ridx[i];
2145: x1 = xb[0]; x2 = xb[1];
2146: ib = idx + ii[0];
2147: n = ii[1] - ii[0]; ii++;
2148: for (j=0; j<n; j++) {
2149: rval = ib[j]*2;
2150: z[rval++] += v[0]*x1 + v[1]*x2;
2151: z[rval++] += v[2]*x1 + v[3]*x2;
2152: v += 4;
2153: }
2154: if (!usecprow) xb += 2;
2155: }
2156: break;
2157: case 3:
2158: for (i=0; i<mbs; i++) {
2159: if (usecprow) xb = x + 3*ridx[i];
2160: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2161: ib = idx + ii[0];
2162: n = ii[1] - ii[0]; ii++;
2163: for (j=0; j<n; j++) {
2164: rval = ib[j]*3;
2165: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2166: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2167: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2168: v += 9;
2169: }
2170: if (!usecprow) xb += 3;
2171: }
2172: break;
2173: case 4:
2174: for (i=0; i<mbs; i++) {
2175: if (usecprow) xb = x + 4*ridx[i];
2176: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2177: ib = idx + ii[0];
2178: n = ii[1] - ii[0]; ii++;
2179: for (j=0; j<n; j++) {
2180: rval = ib[j]*4;
2181: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2182: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2183: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2184: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2185: v += 16;
2186: }
2187: if (!usecprow) xb += 4;
2188: }
2189: break;
2190: case 5:
2191: for (i=0; i<mbs; i++) {
2192: if (usecprow) xb = x + 5*ridx[i];
2193: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2194: x4 = xb[3]; x5 = xb[4];
2195: ib = idx + ii[0];
2196: n = ii[1] - ii[0]; ii++;
2197: for (j=0; j<n; j++) {
2198: rval = ib[j]*5;
2199: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2200: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2201: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2202: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2203: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2204: v += 25;
2205: }
2206: if (!usecprow) xb += 5;
2207: }
2208: break;
2209: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2210: PetscInt ncols,k;
2211: PetscScalar *work,*workt;
2212: const PetscScalar *xtmp;
2213: if (!a->mult_work) {
2214: k = PetscMax(A->rmap->n,A->cmap->n);
2215: PetscMalloc1(k+1,&a->mult_work);
2216: }
2217: work = a->mult_work;
2218: xtmp = x;
2219: for (i=0; i<mbs; i++) {
2220: n = ii[1] - ii[0]; ii++;
2221: ncols = n*bs;
2222: PetscArrayzero(work,ncols);
2223: if (usecprow) xtmp = x + bs*ridx[i];
2224: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2225: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2226: v += n*bs2;
2227: if (!usecprow) xtmp += bs;
2228: workt = work;
2229: for (j=0; j<n; j++) {
2230: zb = z + bs*(*idx++);
2231: for (k=0; k<bs; k++) zb[k] += workt[k];
2232: workt += bs;
2233: }
2234: }
2235: }
2236: }
2237: VecRestoreArrayRead(xx,&x);
2238: VecRestoreArray(zz,&z);
2239: PetscLogFlops(2.0*a->nz*a->bs2);
2240: return(0);
2241: }
2243: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2244: {
2245: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2246: PetscInt totalnz = a->bs2*a->nz;
2247: PetscScalar oalpha = alpha;
2249: PetscBLASInt one = 1,tnz;
2252: PetscBLASIntCast(totalnz,&tnz);
2253: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2254: PetscLogFlops(totalnz);
2255: return(0);
2256: }
2258: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2259: {
2261: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2262: MatScalar *v = a->a;
2263: PetscReal sum = 0.0;
2264: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2267: if (type == NORM_FROBENIUS) {
2268: #if defined(PETSC_USE_REAL___FP16)
2269: PetscBLASInt one = 1,cnt = bs2*nz;
2270: *norm = BLASnrm2_(&cnt,v,&one);
2271: #else
2272: for (i=0; i<bs2*nz; i++) {
2273: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2274: }
2275: #endif
2276: *norm = PetscSqrtReal(sum);
2277: PetscLogFlops(2.0*bs2*nz);
2278: } else if (type == NORM_1) { /* maximum column sum */
2279: PetscReal *tmp;
2280: PetscInt *bcol = a->j;
2281: PetscCalloc1(A->cmap->n+1,&tmp);
2282: for (i=0; i<nz; i++) {
2283: for (j=0; j<bs; j++) {
2284: k1 = bs*(*bcol) + j; /* column index */
2285: for (k=0; k<bs; k++) {
2286: tmp[k1] += PetscAbsScalar(*v); v++;
2287: }
2288: }
2289: bcol++;
2290: }
2291: *norm = 0.0;
2292: for (j=0; j<A->cmap->n; j++) {
2293: if (tmp[j] > *norm) *norm = tmp[j];
2294: }
2295: PetscFree(tmp);
2296: PetscLogFlops(PetscMax(bs2*nz-1,0));
2297: } else if (type == NORM_INFINITY) { /* maximum row sum */
2298: *norm = 0.0;
2299: for (k=0; k<bs; k++) {
2300: for (j=0; j<a->mbs; j++) {
2301: v = a->a + bs2*a->i[j] + k;
2302: sum = 0.0;
2303: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2304: for (k1=0; k1<bs; k1++) {
2305: sum += PetscAbsScalar(*v);
2306: v += bs;
2307: }
2308: }
2309: if (sum > *norm) *norm = sum;
2310: }
2311: }
2312: PetscLogFlops(PetscMax(bs2*nz-1,0));
2313: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2314: return(0);
2315: }
2318: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2319: {
2320: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2324: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2325: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2326: *flg = PETSC_FALSE;
2327: return(0);
2328: }
2330: /* if the a->i are the same */
2331: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2332: if (!*flg) return(0);
2334: /* if a->j are the same */
2335: PetscArraycmp(a->j,b->j,a->nz,flg);
2336: if (!*flg) return(0);
2338: /* if a->a are the same */
2339: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2340: return(0);
2342: }
2344: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2345: {
2346: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2348: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2349: PetscScalar *x,zero = 0.0;
2350: MatScalar *aa,*aa_j;
2353: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2354: bs = A->rmap->bs;
2355: aa = a->a;
2356: ai = a->i;
2357: aj = a->j;
2358: ambs = a->mbs;
2359: bs2 = a->bs2;
2361: VecSet(v,zero);
2362: VecGetArray(v,&x);
2363: VecGetLocalSize(v,&n);
2364: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2365: for (i=0; i<ambs; i++) {
2366: for (j=ai[i]; j<ai[i+1]; j++) {
2367: if (aj[j] == i) {
2368: row = i*bs;
2369: aa_j = aa+j*bs2;
2370: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2371: break;
2372: }
2373: }
2374: }
2375: VecRestoreArray(v,&x);
2376: return(0);
2377: }
2379: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2380: {
2381: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2382: const PetscScalar *l,*r,*li,*ri;
2383: PetscScalar x;
2384: MatScalar *aa, *v;
2385: PetscErrorCode ierr;
2386: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2387: const PetscInt *ai,*aj;
2390: ai = a->i;
2391: aj = a->j;
2392: aa = a->a;
2393: m = A->rmap->n;
2394: n = A->cmap->n;
2395: bs = A->rmap->bs;
2396: mbs = a->mbs;
2397: bs2 = a->bs2;
2398: if (ll) {
2399: VecGetArrayRead(ll,&l);
2400: VecGetLocalSize(ll,&lm);
2401: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2402: for (i=0; i<mbs; i++) { /* for each block row */
2403: M = ai[i+1] - ai[i];
2404: li = l + i*bs;
2405: v = aa + bs2*ai[i];
2406: for (j=0; j<M; j++) { /* for each block */
2407: for (k=0; k<bs2; k++) {
2408: (*v++) *= li[k%bs];
2409: }
2410: }
2411: }
2412: VecRestoreArrayRead(ll,&l);
2413: PetscLogFlops(a->nz);
2414: }
2416: if (rr) {
2417: VecGetArrayRead(rr,&r);
2418: VecGetLocalSize(rr,&rn);
2419: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2420: for (i=0; i<mbs; i++) { /* for each block row */
2421: iai = ai[i];
2422: M = ai[i+1] - iai;
2423: v = aa + bs2*iai;
2424: for (j=0; j<M; j++) { /* for each block */
2425: ri = r + bs*aj[iai+j];
2426: for (k=0; k<bs; k++) {
2427: x = ri[k];
2428: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2429: v += bs;
2430: }
2431: }
2432: }
2433: VecRestoreArrayRead(rr,&r);
2434: PetscLogFlops(a->nz);
2435: }
2436: return(0);
2437: }
2440: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2441: {
2442: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2445: info->block_size = a->bs2;
2446: info->nz_allocated = a->bs2*a->maxnz;
2447: info->nz_used = a->bs2*a->nz;
2448: info->nz_unneeded = info->nz_allocated - info->nz_used;
2449: info->assemblies = A->num_ass;
2450: info->mallocs = A->info.mallocs;
2451: info->memory = ((PetscObject)A)->mem;
2452: if (A->factortype) {
2453: info->fill_ratio_given = A->info.fill_ratio_given;
2454: info->fill_ratio_needed = A->info.fill_ratio_needed;
2455: info->factor_mallocs = A->info.factor_mallocs;
2456: } else {
2457: info->fill_ratio_given = 0;
2458: info->fill_ratio_needed = 0;
2459: info->factor_mallocs = 0;
2460: }
2461: return(0);
2462: }
2464: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2465: {
2466: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2470: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2471: return(0);
2472: }
2474: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2475: {
2479: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2481: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2482: return(0);
2483: }
2485: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2486: {
2487: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2488: PetscScalar *z = 0,sum1;
2489: const PetscScalar *xb;
2490: PetscScalar x1;
2491: const MatScalar *v,*vv;
2492: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2493: PetscBool usecprow=a->compressedrow.use;
2496: idx = a->j;
2497: v = a->a;
2498: if (usecprow) {
2499: mbs = a->compressedrow.nrows;
2500: ii = a->compressedrow.i;
2501: ridx = a->compressedrow.rindex;
2502: } else {
2503: mbs = a->mbs;
2504: ii = a->i;
2505: z = c;
2506: }
2508: for (i=0; i<mbs; i++) {
2509: n = ii[1] - ii[0]; ii++;
2510: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2511: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2512: if (usecprow) z = c + ridx[i];
2513: jj = idx;
2514: vv = v;
2515: for (k=0; k<cn; k++) {
2516: idx = jj;
2517: v = vv;
2518: sum1 = 0.0;
2519: for (j=0; j<n; j++) {
2520: xb = b + (*idx++); x1 = xb[0+k*bm];
2521: sum1 += v[0]*x1;
2522: v += 1;
2523: }
2524: z[0+k*cm] = sum1;
2525: }
2526: if (!usecprow) z += 1;
2527: }
2528: return(0);
2529: }
2531: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2532: {
2533: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2534: PetscScalar *z = 0,sum1,sum2;
2535: const PetscScalar *xb;
2536: PetscScalar x1,x2;
2537: const MatScalar *v,*vv;
2538: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2539: PetscBool usecprow=a->compressedrow.use;
2542: idx = a->j;
2543: v = a->a;
2544: if (usecprow) {
2545: mbs = a->compressedrow.nrows;
2546: ii = a->compressedrow.i;
2547: ridx = a->compressedrow.rindex;
2548: } else {
2549: mbs = a->mbs;
2550: ii = a->i;
2551: z = c;
2552: }
2554: for (i=0; i<mbs; i++) {
2555: n = ii[1] - ii[0]; ii++;
2556: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2557: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2558: if (usecprow) z = c + 2*ridx[i];
2559: jj = idx;
2560: vv = v;
2561: for (k=0; k<cn; k++) {
2562: idx = jj;
2563: v = vv;
2564: sum1 = 0.0; sum2 = 0.0;
2565: for (j=0; j<n; j++) {
2566: xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2567: sum1 += v[0]*x1 + v[2]*x2;
2568: sum2 += v[1]*x1 + v[3]*x2;
2569: v += 4;
2570: }
2571: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2572: }
2573: if (!usecprow) z += 2;
2574: }
2575: return(0);
2576: }
2578: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2579: {
2580: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2581: PetscScalar *z = 0,sum1,sum2,sum3;
2582: const PetscScalar *xb;
2583: PetscScalar x1,x2,x3;
2584: const MatScalar *v,*vv;
2585: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2586: PetscBool usecprow=a->compressedrow.use;
2589: idx = a->j;
2590: v = a->a;
2591: if (usecprow) {
2592: mbs = a->compressedrow.nrows;
2593: ii = a->compressedrow.i;
2594: ridx = a->compressedrow.rindex;
2595: } else {
2596: mbs = a->mbs;
2597: ii = a->i;
2598: z = c;
2599: }
2601: for (i=0; i<mbs; i++) {
2602: n = ii[1] - ii[0]; ii++;
2603: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2604: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2605: if (usecprow) z = c + 3*ridx[i];
2606: jj = idx;
2607: vv = v;
2608: for (k=0; k<cn; k++) {
2609: idx = jj;
2610: v = vv;
2611: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
2612: for (j=0; j<n; j++) {
2613: xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2614: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2615: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2616: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2617: v += 9;
2618: }
2619: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2620: }
2621: if (!usecprow) z += 3;
2622: }
2623: return(0);
2624: }
2626: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2627: {
2628: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2629: PetscScalar *z = 0,sum1,sum2,sum3,sum4;
2630: const PetscScalar *xb;
2631: PetscScalar x1,x2,x3,x4;
2632: const MatScalar *v,*vv;
2633: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2634: PetscBool usecprow=a->compressedrow.use;
2637: idx = a->j;
2638: v = a->a;
2639: if (usecprow) {
2640: mbs = a->compressedrow.nrows;
2641: ii = a->compressedrow.i;
2642: ridx = a->compressedrow.rindex;
2643: } else {
2644: mbs = a->mbs;
2645: ii = a->i;
2646: z = c;
2647: }
2649: for (i=0; i<mbs; i++) {
2650: n = ii[1] - ii[0]; ii++;
2651: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2652: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2653: if (usecprow) z = c + 4*ridx[i];
2654: jj = idx;
2655: vv = v;
2656: for (k=0; k<cn; k++) {
2657: idx = jj;
2658: v = vv;
2659: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2660: for (j=0; j<n; j++) {
2661: xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
2662: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2663: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2664: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2665: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2666: v += 16;
2667: }
2668: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2669: }
2670: if (!usecprow) z += 4;
2671: }
2672: return(0);
2673: }
2675: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2676: {
2677: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2678: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5;
2679: const PetscScalar *xb;
2680: PetscScalar x1,x2,x3,x4,x5;
2681: const MatScalar *v,*vv;
2682: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2683: PetscBool usecprow=a->compressedrow.use;
2686: idx = a->j;
2687: v = a->a;
2688: if (usecprow) {
2689: mbs = a->compressedrow.nrows;
2690: ii = a->compressedrow.i;
2691: ridx = a->compressedrow.rindex;
2692: } else {
2693: mbs = a->mbs;
2694: ii = a->i;
2695: z = c;
2696: }
2698: for (i=0; i<mbs; i++) {
2699: n = ii[1] - ii[0]; ii++;
2700: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2701: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2702: if (usecprow) z = c + 5*ridx[i];
2703: jj = idx;
2704: vv = v;
2705: for (k=0; k<cn; k++) {
2706: idx = jj;
2707: v = vv;
2708: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2709: for (j=0; j<n; j++) {
2710: xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
2711: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2712: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2713: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2714: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2715: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2716: v += 25;
2717: }
2718: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2719: }
2720: if (!usecprow) z += 5;
2721: }
2722: return(0);
2723: }
2725: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2726: {
2727: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2728: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
2729: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
2730: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2731: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2732: PetscBLASInt bbs,bcn,bbm,bcm;
2733: PetscScalar *z = 0;
2734: PetscScalar *c,*b;
2735: const MatScalar *v;
2736: const PetscInt *idx,*ii,*ridx=NULL;
2737: PetscScalar _DZero=0.0,_DOne=1.0;
2738: PetscBool usecprow=a->compressedrow.use;
2739: PetscErrorCode ierr;
2742: if (!cm || !cn) return(0);
2743: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2744: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2745: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2746: b = bd->v;
2747: if (a->nonzerorowcnt != A->rmap->n) {
2748: MatZeroEntries(C);
2749: }
2750: MatDenseGetArray(C,&c);
2751: switch (bs) {
2752: case 1:
2753: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2754: break;
2755: case 2:
2756: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2757: break;
2758: case 3:
2759: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2760: break;
2761: case 4:
2762: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2763: break;
2764: case 5:
2765: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2766: break;
2767: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2768: PetscBLASIntCast(bs,&bbs);
2769: PetscBLASIntCast(cn,&bcn);
2770: PetscBLASIntCast(bm,&bbm);
2771: PetscBLASIntCast(cm,&bcm);
2772: idx = a->j;
2773: v = a->a;
2774: if (usecprow) {
2775: mbs = a->compressedrow.nrows;
2776: ii = a->compressedrow.i;
2777: ridx = a->compressedrow.rindex;
2778: } else {
2779: mbs = a->mbs;
2780: ii = a->i;
2781: z = c;
2782: }
2783: for (i=0; i<mbs; i++) {
2784: n = ii[1] - ii[0]; ii++;
2785: if (usecprow) z = c + bs*ridx[i];
2786: if (n) {
2787: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2788: v += bs2;
2789: }
2790: for (j=1; j<n; j++) {
2791: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2792: v += bs2;
2793: }
2794: if (!usecprow) z += bs;
2795: }
2796: }
2797: MatDenseRestoreArray(C,&c);
2798: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2799: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2800: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
2801: return(0);
2802: }