Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
15: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
16: const PetscInt *idx;
17: PetscInt start,end,*ai,*aj,bs,*nidx2;
18: PetscBT table;
21: m = a->mbs;
22: ai = a->i;
23: aj = a->j;
24: bs = A->rmap->bs;
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28: PetscBTCreate(m,&table);
29: PetscMalloc1(m+1,&nidx);
30: PetscMalloc1(A->rmap->N+1,&nidx2);
32: for (i=0; i<is_max; i++) {
33: /* Initialise the two local arrays */
34: isz = 0;
35: PetscBTMemzero(m,table);
37: /* Extract the indices, assume there can be duplicate entries */
38: ISGetIndices(is[i],&idx);
39: ISGetLocalSize(is[i],&n);
41: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42: for (j=0; j<n; ++j) {
43: ival = idx[j]/bs; /* convert the indices into block indices */
44: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(&is[i]);
50: k = 0;
51: for (j=0; j<ov; j++) { /* for each overlap*/
52: n = isz;
53: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
54: row = nidx[k];
55: start = ai[row];
56: end = ai[row+1];
57: for (l = start; l<end; l++) {
58: val = aj[l];
59: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60: }
61: }
62: }
63: /* expand the Index Set */
64: for (j=0; j<isz; j++) {
65: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66: }
67: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
68: }
69: PetscBTDestroy(&table);
70: PetscFree(nidx);
71: PetscFree(nidx2);
72: return(0);
73: }
75: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
89: ISGetIndices(isrow,&irow);
90: ISGetIndices(iscol,&icol);
91: ISGetLocalSize(isrow,&nrows);
92: ISGetLocalSize(iscol,&ncols);
94: PetscCalloc1(1+oldcols,&smap);
95: ssmap = smap;
96: PetscMalloc1(1+nrows,&lens);
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) lens[i]++;
105: }
106: }
107: /* Create and fill new matrix */
108: if (scall == MAT_REUSE_MATRIX) {
109: c = (Mat_SeqBAIJ*)((*B)->data);
111: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
113: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114: PetscArrayzero(c->ilen,c->mbs);
115: C = *B;
116: } else {
117: MatCreate(PetscObjectComm((PetscObject)A),&C);
118: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119: MatSetType(C,((PetscObject)A)->type_name);
120: MatSeqBAIJSetPreallocation(C,bs,0,lens);
121: }
122: c = (Mat_SeqBAIJ*)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140: /* sort */
141: {
142: MatScalar *work;
143: PetscMalloc1(bs2,&work);
144: for (i=0; i<nrows; i++) {
145: PetscInt ilen;
146: mat_i = c->i[i];
147: mat_j = c->j + mat_i;
148: mat_a = c->a + mat_i*bs2;
149: ilen = c->ilen[i];
150: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151: }
152: PetscFree(work);
153: }
155: /* Free work space */
156: ISRestoreIndices(iscol,&icol);
157: PetscFree(smap);
158: PetscFree(lens);
159: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
162: ISRestoreIndices(isrow,&irow);
163: *B = C;
164: return(0);
165: }
167: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168: {
169: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
170: IS is1,is2;
172: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173: const PetscInt *irow,*icol;
176: ISGetIndices(isrow,&irow);
177: ISGetIndices(iscol,&icol);
178: ISGetLocalSize(isrow,&nrows);
179: ISGetLocalSize(iscol,&ncols);
181: /* Verify if the indices corespond to each element in a block
182: and form the IS with compressed IS */
183: maxmnbs = PetscMax(a->mbs,a->nbs);
184: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
185: PetscArrayzero(vary,a->mbs);
186: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187: for (i=0; i<a->mbs; i++) {
188: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189: }
190: count = 0;
191: for (i=0; i<nrows; i++) {
192: j = irow[i] / bs;
193: if ((vary[j]--)==bs) iary[count++] = j;
194: }
195: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
197: PetscArrayzero(vary,a->nbs);
198: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199: for (i=0; i<a->nbs; i++) {
200: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201: }
202: count = 0;
203: for (i=0; i<ncols; i++) {
204: j = icol[i] / bs;
205: if ((vary[j]--)==bs) iary[count++] = j;
206: }
207: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
208: ISRestoreIndices(isrow,&irow);
209: ISRestoreIndices(iscol,&icol);
210: PetscFree2(vary,iary);
212: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
213: ISDestroy(&is1);
214: ISDestroy(&is2);
215: return(0);
216: }
218: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219: {
221: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
222: Mat_SubSppt *submatj = c->submatis1;
225: (*submatj->destroy)(C);
226: MatDestroySubMatrix_Private(submatj);
227: return(0);
228: }
230: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231: {
233: PetscInt i;
234: Mat C;
235: Mat_SeqBAIJ *c;
236: Mat_SubSppt *submatj;
239: for (i=0; i<n; i++) {
240: C = (*mat)[i];
241: c = (Mat_SeqBAIJ*)C->data;
242: submatj = c->submatis1;
243: if (submatj) {
244: if (--((PetscObject)C)->refct <= 0) {
245: (*submatj->destroy)(C);
246: MatDestroySubMatrix_Private(submatj);
247: PetscFree(C->defaultvectype);
248: PetscLayoutDestroy(&C->rmap);
249: PetscLayoutDestroy(&C->cmap);
250: PetscHeaderDestroy(&C);
251: }
252: } else {
253: MatDestroy(&C);
254: }
255: }
257: /* Destroy Dummy submatrices created for reuse */
258: MatDestroySubMatrices_Dummy(n,mat);
260: PetscFree(*mat);
261: return(0);
262: }
264: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
265: {
267: PetscInt i;
270: if (scall == MAT_INITIAL_MATRIX) {
271: PetscCalloc1(n+1,B);
272: }
274: for (i=0; i<n; i++) {
275: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
276: }
277: return(0);
278: }
280: /* -------------------------------------------------------*/
281: /* Should check that shapes of vectors and matrices match */
282: /* -------------------------------------------------------*/
284: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
285: {
286: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
287: PetscScalar *z,sum;
288: const PetscScalar *x;
289: const MatScalar *v;
290: PetscErrorCode ierr;
291: PetscInt mbs,i,n;
292: const PetscInt *idx,*ii,*ridx=NULL;
293: PetscBool usecprow=a->compressedrow.use;
296: VecGetArrayRead(xx,&x);
297: VecGetArrayWrite(zz,&z);
299: if (usecprow) {
300: mbs = a->compressedrow.nrows;
301: ii = a->compressedrow.i;
302: ridx = a->compressedrow.rindex;
303: PetscArrayzero(z,a->mbs);
304: } else {
305: mbs = a->mbs;
306: ii = a->i;
307: }
309: for (i=0; i<mbs; i++) {
310: n = ii[1] - ii[0];
311: v = a->a + ii[0];
312: idx = a->j + ii[0];
313: ii++;
314: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
315: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
316: sum = 0.0;
317: PetscSparseDensePlusDot(sum,x,v,idx,n);
318: if (usecprow) {
319: z[ridx[i]] = sum;
320: } else {
321: z[i] = sum;
322: }
323: }
324: VecRestoreArrayRead(xx,&x);
325: VecRestoreArrayWrite(zz,&z);
326: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
327: return(0);
328: }
330: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
331: {
332: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
333: PetscScalar *z = NULL,sum1,sum2,*zarray;
334: const PetscScalar *x,*xb;
335: PetscScalar x1,x2;
336: const MatScalar *v;
337: PetscErrorCode ierr;
338: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
339: PetscBool usecprow=a->compressedrow.use;
342: VecGetArrayRead(xx,&x);
343: VecGetArrayWrite(zz,&zarray);
345: idx = a->j;
346: v = a->a;
347: if (usecprow) {
348: mbs = a->compressedrow.nrows;
349: ii = a->compressedrow.i;
350: ridx = a->compressedrow.rindex;
351: PetscArrayzero(zarray,2*a->mbs);
352: } else {
353: mbs = a->mbs;
354: ii = a->i;
355: z = zarray;
356: }
358: for (i=0; i<mbs; i++) {
359: n = ii[1] - ii[0]; ii++;
360: sum1 = 0.0; sum2 = 0.0;
361: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
362: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
363: for (j=0; j<n; j++) {
364: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
365: sum1 += v[0]*x1 + v[2]*x2;
366: sum2 += v[1]*x1 + v[3]*x2;
367: v += 4;
368: }
369: if (usecprow) z = zarray + 2*ridx[i];
370: z[0] = sum1; z[1] = sum2;
371: if (!usecprow) z += 2;
372: }
373: VecRestoreArrayRead(xx,&x);
374: VecRestoreArrayWrite(zz,&zarray);
375: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
376: return(0);
377: }
379: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
380: {
381: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
382: PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
383: const PetscScalar *x,*xb;
384: const MatScalar *v;
385: PetscErrorCode ierr;
386: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
387: PetscBool usecprow=a->compressedrow.use;
389: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
390: #pragma disjoint(*v,*z,*xb)
391: #endif
394: VecGetArrayRead(xx,&x);
395: VecGetArrayWrite(zz,&zarray);
397: idx = a->j;
398: v = a->a;
399: if (usecprow) {
400: mbs = a->compressedrow.nrows;
401: ii = a->compressedrow.i;
402: ridx = a->compressedrow.rindex;
403: PetscArrayzero(zarray,3*a->mbs);
404: } else {
405: mbs = a->mbs;
406: ii = a->i;
407: z = zarray;
408: }
410: for (i=0; i<mbs; i++) {
411: n = ii[1] - ii[0]; ii++;
412: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
413: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
414: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415: for (j=0; j<n; j++) {
416: xb = x + 3*(*idx++);
417: x1 = xb[0];
418: x2 = xb[1];
419: x3 = xb[2];
421: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
422: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
423: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
424: v += 9;
425: }
426: if (usecprow) z = zarray + 3*ridx[i];
427: z[0] = sum1; z[1] = sum2; z[2] = sum3;
428: if (!usecprow) z += 3;
429: }
430: VecRestoreArrayRead(xx,&x);
431: VecRestoreArrayWrite(zz,&zarray);
432: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
433: return(0);
434: }
436: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
437: {
438: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
439: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
440: const PetscScalar *x,*xb;
441: const MatScalar *v;
442: PetscErrorCode ierr;
443: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
444: PetscBool usecprow=a->compressedrow.use;
447: VecGetArrayRead(xx,&x);
448: VecGetArrayWrite(zz,&zarray);
450: idx = a->j;
451: v = a->a;
452: if (usecprow) {
453: mbs = a->compressedrow.nrows;
454: ii = a->compressedrow.i;
455: ridx = a->compressedrow.rindex;
456: PetscArrayzero(zarray,4*a->mbs);
457: } else {
458: mbs = a->mbs;
459: ii = a->i;
460: z = zarray;
461: }
463: for (i=0; i<mbs; i++) {
464: n = ii[1] - ii[0];
465: ii++;
466: sum1 = 0.0;
467: sum2 = 0.0;
468: sum3 = 0.0;
469: sum4 = 0.0;
471: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
472: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
473: for (j=0; j<n; j++) {
474: xb = x + 4*(*idx++);
475: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
476: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
477: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
478: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
479: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
480: v += 16;
481: }
482: if (usecprow) z = zarray + 4*ridx[i];
483: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
484: if (!usecprow) z += 4;
485: }
486: VecRestoreArrayRead(xx,&x);
487: VecRestoreArrayWrite(zz,&zarray);
488: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
489: return(0);
490: }
492: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
493: {
494: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
495: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
496: const PetscScalar *xb,*x;
497: const MatScalar *v;
498: PetscErrorCode ierr;
499: const PetscInt *idx,*ii,*ridx=NULL;
500: PetscInt mbs,i,j,n;
501: PetscBool usecprow=a->compressedrow.use;
504: VecGetArrayRead(xx,&x);
505: VecGetArrayWrite(zz,&zarray);
507: idx = a->j;
508: v = a->a;
509: if (usecprow) {
510: mbs = a->compressedrow.nrows;
511: ii = a->compressedrow.i;
512: ridx = a->compressedrow.rindex;
513: PetscArrayzero(zarray,5*a->mbs);
514: } else {
515: mbs = a->mbs;
516: ii = a->i;
517: z = zarray;
518: }
520: for (i=0; i<mbs; i++) {
521: n = ii[1] - ii[0]; ii++;
522: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
523: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
524: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
525: for (j=0; j<n; j++) {
526: xb = x + 5*(*idx++);
527: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
528: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
529: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
530: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
531: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
532: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
533: v += 25;
534: }
535: if (usecprow) z = zarray + 5*ridx[i];
536: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
537: if (!usecprow) z += 5;
538: }
539: VecRestoreArrayRead(xx,&x);
540: VecRestoreArrayWrite(zz,&zarray);
541: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
542: return(0);
543: }
545: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
546: {
547: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
548: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
549: const PetscScalar *x,*xb;
550: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
551: const MatScalar *v;
552: PetscErrorCode ierr;
553: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
554: PetscBool usecprow=a->compressedrow.use;
557: VecGetArrayRead(xx,&x);
558: VecGetArrayWrite(zz,&zarray);
560: idx = a->j;
561: v = a->a;
562: if (usecprow) {
563: mbs = a->compressedrow.nrows;
564: ii = a->compressedrow.i;
565: ridx = a->compressedrow.rindex;
566: PetscArrayzero(zarray,6*a->mbs);
567: } else {
568: mbs = a->mbs;
569: ii = a->i;
570: z = zarray;
571: }
573: for (i=0; i<mbs; i++) {
574: n = ii[1] - ii[0];
575: ii++;
576: sum1 = 0.0;
577: sum2 = 0.0;
578: sum3 = 0.0;
579: sum4 = 0.0;
580: sum5 = 0.0;
581: sum6 = 0.0;
583: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
584: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585: for (j=0; j<n; j++) {
586: xb = x + 6*(*idx++);
587: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
588: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
589: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
590: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
591: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
592: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
593: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
594: v += 36;
595: }
596: if (usecprow) z = zarray + 6*ridx[i];
597: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
598: if (!usecprow) z += 6;
599: }
601: VecRestoreArrayRead(xx,&x);
602: VecRestoreArrayWrite(zz,&zarray);
603: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
604: return(0);
605: }
607: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
608: {
609: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
610: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
611: const PetscScalar *x,*xb;
612: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
613: const MatScalar *v;
614: PetscErrorCode ierr;
615: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
616: PetscBool usecprow=a->compressedrow.use;
619: VecGetArrayRead(xx,&x);
620: VecGetArrayWrite(zz,&zarray);
622: idx = a->j;
623: v = a->a;
624: if (usecprow) {
625: mbs = a->compressedrow.nrows;
626: ii = a->compressedrow.i;
627: ridx = a->compressedrow.rindex;
628: PetscArrayzero(zarray,7*a->mbs);
629: } else {
630: mbs = a->mbs;
631: ii = a->i;
632: z = zarray;
633: }
635: for (i=0; i<mbs; i++) {
636: n = ii[1] - ii[0];
637: ii++;
638: sum1 = 0.0;
639: sum2 = 0.0;
640: sum3 = 0.0;
641: sum4 = 0.0;
642: sum5 = 0.0;
643: sum6 = 0.0;
644: sum7 = 0.0;
646: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
647: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
648: for (j=0; j<n; j++) {
649: xb = x + 7*(*idx++);
650: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
651: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
652: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
653: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
654: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
655: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
656: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
657: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
658: v += 49;
659: }
660: if (usecprow) z = zarray + 7*ridx[i];
661: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
662: if (!usecprow) z += 7;
663: }
665: VecRestoreArrayRead(xx,&x);
666: VecRestoreArrayWrite(zz,&zarray);
667: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
668: return(0);
669: }
671: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
672: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
673: {
674: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
675: PetscScalar *z = NULL,*work,*workt,*zarray;
676: const PetscScalar *x,*xb;
677: const MatScalar *v;
678: PetscErrorCode ierr;
679: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
680: const PetscInt *idx,*ii,*ridx=NULL;
681: PetscInt k;
682: PetscBool usecprow=a->compressedrow.use;
684: __m256d a0,a1,a2,a3,a4,a5;
685: __m256d w0,w1,w2,w3;
686: __m256d z0,z1,z2;
687: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
690: VecGetArrayRead(xx,&x);
691: VecGetArrayWrite(zz,&zarray);
693: idx = a->j;
694: v = a->a;
695: if (usecprow) {
696: mbs = a->compressedrow.nrows;
697: ii = a->compressedrow.i;
698: ridx = a->compressedrow.rindex;
699: PetscArrayzero(zarray,bs*a->mbs);
700: } else {
701: mbs = a->mbs;
702: ii = a->i;
703: z = zarray;
704: }
706: if (!a->mult_work) {
707: k = PetscMax(A->rmap->n,A->cmap->n);
708: PetscMalloc1(k+1,&a->mult_work);
709: }
711: work = a->mult_work;
712: for (i=0; i<mbs; i++) {
713: n = ii[1] - ii[0]; ii++;
714: workt = work;
715: for (j=0; j<n; j++) {
716: xb = x + bs*(*idx++);
717: for (k=0; k<bs; k++) workt[k] = xb[k];
718: workt += bs;
719: }
720: if (usecprow) z = zarray + bs*ridx[i];
722: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
724: for (j=0; j<n; j++) {
725: /* first column of a */
726: w0 = _mm256_set1_pd(work[j*9 ]);
727: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
728: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
729: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
731: /* second column of a */
732: w1 = _mm256_set1_pd(work[j*9+ 1]);
733: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
734: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
735: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
737: /* third column of a */
738: w2 = _mm256_set1_pd(work[j*9 +2]);
739: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
740: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
741: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
743: /* fourth column of a */
744: w3 = _mm256_set1_pd(work[j*9+ 3]);
745: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
746: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
747: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
749: /* fifth column of a */
750: w0 = _mm256_set1_pd(work[j*9+ 4]);
751: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
752: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
753: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
755: /* sixth column of a */
756: w1 = _mm256_set1_pd(work[j*9+ 5]);
757: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
758: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
759: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
761: /* seventh column of a */
762: w2 = _mm256_set1_pd(work[j*9+ 6]);
763: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
764: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
765: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
767: /* eigth column of a */
768: w3 = _mm256_set1_pd(work[j*9+ 7]);
769: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
770: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
771: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
773: /* ninth column of a */
774: w0 = _mm256_set1_pd(work[j*9+ 8]);
775: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
776: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
777: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
778: }
780: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
782: v += n*bs2;
783: if (!usecprow) z += bs;
784: }
785: VecRestoreArrayRead(xx,&x);
786: VecRestoreArrayWrite(zz,&zarray);
787: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
788: return(0);
789: }
790: #endif
792: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
793: {
794: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
795: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
796: const PetscScalar *x,*xb;
797: PetscScalar *zarray,xv;
798: const MatScalar *v;
799: PetscErrorCode ierr;
800: const PetscInt *ii,*ij=a->j,*idx;
801: PetscInt mbs,i,j,k,n,*ridx=NULL;
802: PetscBool usecprow=a->compressedrow.use;
805: VecGetArrayRead(xx,&x);
806: VecGetArrayWrite(zz,&zarray);
808: v = a->a;
809: if (usecprow) {
810: mbs = a->compressedrow.nrows;
811: ii = a->compressedrow.i;
812: ridx = a->compressedrow.rindex;
813: PetscArrayzero(zarray,11*a->mbs);
814: } else {
815: mbs = a->mbs;
816: ii = a->i;
817: z = zarray;
818: }
820: for (i=0; i<mbs; i++) {
821: n = ii[i+1] - ii[i];
822: idx = ij + ii[i];
823: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
824: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
826: for (j=0; j<n; j++) {
827: xb = x + 11*(idx[j]);
829: for (k=0; k<11; k++) {
830: xv = xb[k];
831: sum1 += v[0]*xv;
832: sum2 += v[1]*xv;
833: sum3 += v[2]*xv;
834: sum4 += v[3]*xv;
835: sum5 += v[4]*xv;
836: sum6 += v[5]*xv;
837: sum7 += v[6]*xv;
838: sum8 += v[7]*xv;
839: sum9 += v[8]*xv;
840: sum10 += v[9]*xv;
841: sum11 += v[10]*xv;
842: v += 11;
843: }
844: }
845: if (usecprow) z = zarray + 11*ridx[i];
846: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
847: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
849: if (!usecprow) z += 11;
850: }
852: VecRestoreArrayRead(xx,&x);
853: VecRestoreArrayWrite(zz,&zarray);
854: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
855: return(0);
856: }
858: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
859: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
860: {
861: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
862: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
863: const PetscScalar *x,*xb;
864: PetscScalar *zarray,xv;
865: const MatScalar *v;
866: PetscErrorCode ierr;
867: const PetscInt *ii,*ij=a->j,*idx;
868: PetscInt mbs,i,j,k,n,*ridx=NULL;
869: PetscBool usecprow=a->compressedrow.use;
872: VecGetArrayRead(xx,&x);
873: VecGetArrayWrite(zz,&zarray);
875: v = a->a;
876: if (usecprow) {
877: mbs = a->compressedrow.nrows;
878: ii = a->compressedrow.i;
879: ridx = a->compressedrow.rindex;
880: PetscArrayzero(zarray,12*a->mbs);
881: } else {
882: mbs = a->mbs;
883: ii = a->i;
884: z = zarray;
885: }
887: for (i=0; i<mbs; i++) {
888: n = ii[i+1] - ii[i];
889: idx = ij + ii[i];
890: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
891: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
893: for (j=0; j<n; j++) {
894: xb = x + 12*(idx[j]);
896: for (k=0; k<12; k++) {
897: xv = xb[k];
898: sum1 += v[0]*xv;
899: sum2 += v[1]*xv;
900: sum3 += v[2]*xv;
901: sum4 += v[3]*xv;
902: sum5 += v[4]*xv;
903: sum6 += v[5]*xv;
904: sum7 += v[6]*xv;
905: sum8 += v[7]*xv;
906: sum9 += v[8]*xv;
907: sum10 += v[9]*xv;
908: sum11 += v[10]*xv;
909: sum12 += v[11]*xv;
910: v += 12;
911: }
912: }
913: if (usecprow) z = zarray + 12*ridx[i];
914: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
916: if (!usecprow) z += 12;
917: }
918: VecRestoreArrayRead(xx,&x);
919: VecRestoreArrayWrite(zz,&zarray);
920: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
921: return(0);
922: }
924: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
925: {
926: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
927: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
928: const PetscScalar *x,*xb;
929: PetscScalar *zarray,*yarray,xv;
930: const MatScalar *v;
931: PetscErrorCode ierr;
932: const PetscInt *ii,*ij=a->j,*idx;
933: PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL;
934: PetscBool usecprow=a->compressedrow.use;
937: VecGetArrayRead(xx,&x);
938: VecGetArrayPair(yy,zz,&yarray,&zarray);
940: v = a->a;
941: if (usecprow) {
942: if (zz != yy) {
943: PetscArraycpy(zarray,yarray,12*mbs);
944: }
945: mbs = a->compressedrow.nrows;
946: ii = a->compressedrow.i;
947: ridx = a->compressedrow.rindex;
948: } else {
949: ii = a->i;
950: y = yarray;
951: z = zarray;
952: }
954: for (i=0; i<mbs; i++) {
955: n = ii[i+1] - ii[i];
956: idx = ij + ii[i];
958: if (usecprow) {
959: y = yarray + 12*ridx[i];
960: z = zarray + 12*ridx[i];
961: }
962: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
963: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
965: for (j=0; j<n; j++) {
966: xb = x + 12*(idx[j]);
968: for (k=0; k<12; k++) {
969: xv = xb[k];
970: sum1 += v[0]*xv;
971: sum2 += v[1]*xv;
972: sum3 += v[2]*xv;
973: sum4 += v[3]*xv;
974: sum5 += v[4]*xv;
975: sum6 += v[5]*xv;
976: sum7 += v[6]*xv;
977: sum8 += v[7]*xv;
978: sum9 += v[8]*xv;
979: sum10 += v[9]*xv;
980: sum11 += v[10]*xv;
981: sum12 += v[11]*xv;
982: v += 12;
983: }
984: }
986: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
987: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
988: if (!usecprow) {
989: y += 12;
990: z += 12;
991: }
992: }
993: VecRestoreArrayRead(xx,&x);
994: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
995: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
996: return(0);
997: }
999: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1000: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1001: {
1002: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1003: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1004: const PetscScalar *x,*xb;
1005: PetscScalar x1,x2,x3,x4,*zarray;
1006: const MatScalar *v;
1007: PetscErrorCode ierr;
1008: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1009: PetscInt mbs,i,j,n;
1010: PetscBool usecprow=a->compressedrow.use;
1013: VecGetArrayRead(xx,&x);
1014: VecGetArrayWrite(zz,&zarray);
1016: v = a->a;
1017: if (usecprow) {
1018: mbs = a->compressedrow.nrows;
1019: ii = a->compressedrow.i;
1020: ridx = a->compressedrow.rindex;
1021: PetscArrayzero(zarray,12*a->mbs);
1022: } else {
1023: mbs = a->mbs;
1024: ii = a->i;
1025: z = zarray;
1026: }
1028: for (i=0; i<mbs; i++) {
1029: n = ii[i+1] - ii[i];
1030: idx = ij + ii[i];
1032: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1033: for (j=0; j<n; j++) {
1034: xb = x + 12*(idx[j]);
1035: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1037: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1038: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1039: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1040: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1041: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1042: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1043: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1044: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1045: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1046: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1047: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1048: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1049: v += 48;
1051: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1053: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1054: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1055: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1056: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1057: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1058: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1059: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1060: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1061: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1062: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1063: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1064: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1065: v += 48;
1067: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1068: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1069: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1070: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1071: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1072: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1073: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1074: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1075: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1076: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1077: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1078: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1079: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1080: v += 48;
1082: }
1083: if (usecprow) z = zarray + 12*ridx[i];
1084: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1085: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1086: if (!usecprow) z += 12;
1087: }
1088: VecRestoreArrayRead(xx,&x);
1089: VecRestoreArrayWrite(zz,&zarray);
1090: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1091: return(0);
1092: }
1094: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1095: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1096: {
1097: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1098: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1099: const PetscScalar *x,*xb;
1100: PetscScalar x1,x2,x3,x4,*zarray,*yarray;
1101: const MatScalar *v;
1102: PetscErrorCode ierr;
1103: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1104: PetscInt mbs = a->mbs,i,j,n;
1105: PetscBool usecprow=a->compressedrow.use;
1108: VecGetArrayRead(xx,&x);
1109: VecGetArrayPair(yy,zz,&yarray,&zarray);
1111: v = a->a;
1112: if (usecprow) {
1113: if (zz != yy) {
1114: PetscArraycpy(zarray,yarray,12*mbs);
1115: }
1116: mbs = a->compressedrow.nrows;
1117: ii = a->compressedrow.i;
1118: ridx = a->compressedrow.rindex;
1119: } else {
1120: ii = a->i;
1121: y = yarray;
1122: z = zarray;
1123: }
1125: for (i=0; i<mbs; i++) {
1126: n = ii[i+1] - ii[i];
1127: idx = ij + ii[i];
1129: if (usecprow) {
1130: y = yarray + 12*ridx[i];
1131: z = zarray + 12*ridx[i];
1132: }
1133: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1134: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1136: for (j=0; j<n; j++) {
1137: xb = x + 12*(idx[j]);
1138: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1140: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1141: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1142: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1143: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1144: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1145: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1146: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1147: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1148: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1149: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1150: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1151: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1152: v += 48;
1154: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1156: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1157: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1158: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1159: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1160: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1161: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1162: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1163: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1164: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1165: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1166: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1167: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1168: v += 48;
1170: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1171: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1172: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1173: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1174: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1175: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1176: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1177: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1178: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1179: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1180: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1181: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1182: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1183: v += 48;
1185: }
1186: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1187: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1188: if (!usecprow) {
1189: y += 12;
1190: z += 12;
1191: }
1192: }
1193: VecRestoreArrayRead(xx,&x);
1194: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1195: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1196: return(0);
1197: }
1199: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1200: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1201: {
1202: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1203: PetscScalar *z = NULL,*zarray;
1204: const PetscScalar *x,*work;
1205: const MatScalar *v = a->a;
1206: PetscErrorCode ierr;
1207: PetscInt mbs,i,j,n;
1208: const PetscInt *idx = a->j,*ii,*ridx=NULL;
1209: PetscBool usecprow=a->compressedrow.use;
1210: const PetscInt bs = 12, bs2 = 144;
1212: __m256d a0,a1,a2,a3,a4,a5;
1213: __m256d w0,w1,w2,w3;
1214: __m256d z0,z1,z2;
1217: VecGetArrayRead(xx,&x);
1218: VecGetArrayWrite(zz,&zarray);
1220: if (usecprow) {
1221: mbs = a->compressedrow.nrows;
1222: ii = a->compressedrow.i;
1223: ridx = a->compressedrow.rindex;
1224: PetscArrayzero(zarray,bs*a->mbs);
1225: } else {
1226: mbs = a->mbs;
1227: ii = a->i;
1228: z = zarray;
1229: }
1231: for (i=0; i<mbs; i++) {
1232: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1234: n = ii[1] - ii[0]; ii++;
1235: for (j=0; j<n; j++) {
1236: work = x + bs*(*idx++);
1238: /* first column of a */
1239: w0 = _mm256_set1_pd(work[0]);
1240: a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1241: a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1242: a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1244: /* second column of a */
1245: w1 = _mm256_set1_pd(work[1]);
1246: a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1247: a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1248: a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1250: /* third column of a */
1251: w2 = _mm256_set1_pd(work[2]);
1252: a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1253: a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1254: a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1256: /* fourth column of a */
1257: w3 = _mm256_set1_pd(work[3]);
1258: a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1259: a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1260: a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1262: /* fifth column of a */
1263: w0 = _mm256_set1_pd(work[4]);
1264: a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1265: a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1266: a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1268: /* sixth column of a */
1269: w1 = _mm256_set1_pd(work[5]);
1270: a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1271: a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1272: a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1274: /* seventh column of a */
1275: w2 = _mm256_set1_pd(work[6]);
1276: a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1277: a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1278: a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1280: /* eigth column of a */
1281: w3 = _mm256_set1_pd(work[7]);
1282: a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1283: a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1284: a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1286: /* ninth column of a */
1287: w0 = _mm256_set1_pd(work[8]);
1288: a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1289: a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1290: a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1292: /* tenth column of a */
1293: w1 = _mm256_set1_pd(work[9]);
1294: a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1295: a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1296: a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1298: /* eleventh column of a */
1299: w2 = _mm256_set1_pd(work[10]);
1300: a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1301: a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1302: a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1304: /* twelveth column of a */
1305: w3 = _mm256_set1_pd(work[11]);
1306: a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1307: a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1308: a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1310: v += bs2;
1311: }
1312: if (usecprow) z = zarray + bs*ridx[i];
1313: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1314: if (!usecprow) z += bs;
1315: }
1316: VecRestoreArrayRead(xx,&x);
1317: VecRestoreArrayWrite(zz,&zarray);
1318: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1319: return(0);
1320: }
1321: #endif
1323: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1324: /* Default MatMult for block size 15 */
1325: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1326: {
1327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1328: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1329: const PetscScalar *x,*xb;
1330: PetscScalar *zarray,xv;
1331: const MatScalar *v;
1332: PetscErrorCode ierr;
1333: const PetscInt *ii,*ij=a->j,*idx;
1334: PetscInt mbs,i,j,k,n,*ridx=NULL;
1335: PetscBool usecprow=a->compressedrow.use;
1338: VecGetArrayRead(xx,&x);
1339: VecGetArrayWrite(zz,&zarray);
1341: v = a->a;
1342: if (usecprow) {
1343: mbs = a->compressedrow.nrows;
1344: ii = a->compressedrow.i;
1345: ridx = a->compressedrow.rindex;
1346: PetscArrayzero(zarray,15*a->mbs);
1347: } else {
1348: mbs = a->mbs;
1349: ii = a->i;
1350: z = zarray;
1351: }
1353: for (i=0; i<mbs; i++) {
1354: n = ii[i+1] - ii[i];
1355: idx = ij + ii[i];
1356: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1357: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1359: for (j=0; j<n; j++) {
1360: xb = x + 15*(idx[j]);
1362: for (k=0; k<15; k++) {
1363: xv = xb[k];
1364: sum1 += v[0]*xv;
1365: sum2 += v[1]*xv;
1366: sum3 += v[2]*xv;
1367: sum4 += v[3]*xv;
1368: sum5 += v[4]*xv;
1369: sum6 += v[5]*xv;
1370: sum7 += v[6]*xv;
1371: sum8 += v[7]*xv;
1372: sum9 += v[8]*xv;
1373: sum10 += v[9]*xv;
1374: sum11 += v[10]*xv;
1375: sum12 += v[11]*xv;
1376: sum13 += v[12]*xv;
1377: sum14 += v[13]*xv;
1378: sum15 += v[14]*xv;
1379: v += 15;
1380: }
1381: }
1382: if (usecprow) z = zarray + 15*ridx[i];
1383: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1384: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1386: if (!usecprow) z += 15;
1387: }
1389: VecRestoreArrayRead(xx,&x);
1390: VecRestoreArrayWrite(zz,&zarray);
1391: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1392: return(0);
1393: }
1395: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1396: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1397: {
1398: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1399: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1400: const PetscScalar *x,*xb;
1401: PetscScalar x1,x2,x3,x4,*zarray;
1402: const MatScalar *v;
1403: PetscErrorCode ierr;
1404: const PetscInt *ii,*ij=a->j,*idx;
1405: PetscInt mbs,i,j,n,*ridx=NULL;
1406: PetscBool usecprow=a->compressedrow.use;
1409: VecGetArrayRead(xx,&x);
1410: VecGetArrayWrite(zz,&zarray);
1412: v = a->a;
1413: if (usecprow) {
1414: mbs = a->compressedrow.nrows;
1415: ii = a->compressedrow.i;
1416: ridx = a->compressedrow.rindex;
1417: PetscArrayzero(zarray,15*a->mbs);
1418: } else {
1419: mbs = a->mbs;
1420: ii = a->i;
1421: z = zarray;
1422: }
1424: for (i=0; i<mbs; i++) {
1425: n = ii[i+1] - ii[i];
1426: idx = ij + ii[i];
1427: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1428: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1430: for (j=0; j<n; j++) {
1431: xb = x + 15*(idx[j]);
1432: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1434: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1435: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1436: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1437: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1438: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1439: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1440: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1441: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1442: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1443: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1444: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1445: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1446: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1447: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1448: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1450: v += 60;
1452: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1454: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1455: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1456: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1457: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1458: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1459: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1460: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1461: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1462: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1463: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1464: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1465: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1466: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1467: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1468: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1469: v += 60;
1471: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1472: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1473: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1474: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1475: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1476: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1477: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1478: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1479: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1480: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1481: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1482: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1483: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1484: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1485: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1486: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1487: v += 60;
1489: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1490: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1491: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1492: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1493: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1494: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1495: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1496: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1497: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1498: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1499: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1500: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1501: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1502: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1503: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1504: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1505: v += 45;
1506: }
1507: if (usecprow) z = zarray + 15*ridx[i];
1508: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1509: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1511: if (!usecprow) z += 15;
1512: }
1514: VecRestoreArrayRead(xx,&x);
1515: VecRestoreArrayWrite(zz,&zarray);
1516: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1517: return(0);
1518: }
1520: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1521: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1522: {
1523: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1524: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1525: const PetscScalar *x,*xb;
1526: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1527: const MatScalar *v;
1528: PetscErrorCode ierr;
1529: const PetscInt *ii,*ij=a->j,*idx;
1530: PetscInt mbs,i,j,n,*ridx=NULL;
1531: PetscBool usecprow=a->compressedrow.use;
1534: VecGetArrayRead(xx,&x);
1535: VecGetArrayWrite(zz,&zarray);
1537: v = a->a;
1538: if (usecprow) {
1539: mbs = a->compressedrow.nrows;
1540: ii = a->compressedrow.i;
1541: ridx = a->compressedrow.rindex;
1542: PetscArrayzero(zarray,15*a->mbs);
1543: } else {
1544: mbs = a->mbs;
1545: ii = a->i;
1546: z = zarray;
1547: }
1549: for (i=0; i<mbs; i++) {
1550: n = ii[i+1] - ii[i];
1551: idx = ij + ii[i];
1552: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1553: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1555: for (j=0; j<n; j++) {
1556: xb = x + 15*(idx[j]);
1557: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1558: x8 = xb[7];
1560: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1561: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1562: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1563: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1564: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1565: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1566: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1567: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1568: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1569: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1570: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1571: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1572: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1573: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1574: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1575: v += 120;
1577: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1579: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1580: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1581: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1582: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1583: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1584: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1585: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1586: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1587: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1588: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1589: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1590: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1591: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1592: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1593: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1594: v += 105;
1595: }
1596: if (usecprow) z = zarray + 15*ridx[i];
1597: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1598: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1600: if (!usecprow) z += 15;
1601: }
1603: VecRestoreArrayRead(xx,&x);
1604: VecRestoreArrayWrite(zz,&zarray);
1605: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1606: return(0);
1607: }
1609: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1610: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1611: {
1612: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1613: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1614: const PetscScalar *x,*xb;
1615: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1616: const MatScalar *v;
1617: PetscErrorCode ierr;
1618: const PetscInt *ii,*ij=a->j,*idx;
1619: PetscInt mbs,i,j,n,*ridx=NULL;
1620: PetscBool usecprow=a->compressedrow.use;
1623: VecGetArrayRead(xx,&x);
1624: VecGetArrayWrite(zz,&zarray);
1626: v = a->a;
1627: if (usecprow) {
1628: mbs = a->compressedrow.nrows;
1629: ii = a->compressedrow.i;
1630: ridx = a->compressedrow.rindex;
1631: PetscArrayzero(zarray,15*a->mbs);
1632: } else {
1633: mbs = a->mbs;
1634: ii = a->i;
1635: z = zarray;
1636: }
1638: for (i=0; i<mbs; i++) {
1639: n = ii[i+1] - ii[i];
1640: idx = ij + ii[i];
1641: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1642: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1644: for (j=0; j<n; j++) {
1645: xb = x + 15*(idx[j]);
1646: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1647: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1649: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1650: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1651: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1652: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1653: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1654: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1655: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1656: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1657: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1658: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1659: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1660: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1661: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1662: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1663: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1664: v += 225;
1665: }
1666: if (usecprow) z = zarray + 15*ridx[i];
1667: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1668: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1670: if (!usecprow) z += 15;
1671: }
1673: VecRestoreArrayRead(xx,&x);
1674: VecRestoreArrayWrite(zz,&zarray);
1675: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1676: return(0);
1677: }
1679: /*
1680: This will not work with MatScalar == float because it calls the BLAS
1681: */
1682: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1683: {
1684: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1685: PetscScalar *z = NULL,*work,*workt,*zarray;
1686: const PetscScalar *x,*xb;
1687: const MatScalar *v;
1688: PetscErrorCode ierr;
1689: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1690: const PetscInt *idx,*ii,*ridx=NULL;
1691: PetscInt ncols,k;
1692: PetscBool usecprow=a->compressedrow.use;
1695: VecGetArrayRead(xx,&x);
1696: VecGetArrayWrite(zz,&zarray);
1698: idx = a->j;
1699: v = a->a;
1700: if (usecprow) {
1701: mbs = a->compressedrow.nrows;
1702: ii = a->compressedrow.i;
1703: ridx = a->compressedrow.rindex;
1704: PetscArrayzero(zarray,bs*a->mbs);
1705: } else {
1706: mbs = a->mbs;
1707: ii = a->i;
1708: z = zarray;
1709: }
1711: if (!a->mult_work) {
1712: k = PetscMax(A->rmap->n,A->cmap->n);
1713: PetscMalloc1(k+1,&a->mult_work);
1714: }
1715: work = a->mult_work;
1716: for (i=0; i<mbs; i++) {
1717: n = ii[1] - ii[0]; ii++;
1718: ncols = n*bs;
1719: workt = work;
1720: for (j=0; j<n; j++) {
1721: xb = x + bs*(*idx++);
1722: for (k=0; k<bs; k++) workt[k] = xb[k];
1723: workt += bs;
1724: }
1725: if (usecprow) z = zarray + bs*ridx[i];
1726: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1727: v += n*bs2;
1728: if (!usecprow) z += bs;
1729: }
1730: VecRestoreArrayRead(xx,&x);
1731: VecRestoreArrayWrite(zz,&zarray);
1732: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1733: return(0);
1734: }
1736: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1737: {
1738: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1739: const PetscScalar *x;
1740: PetscScalar *y,*z,sum;
1741: const MatScalar *v;
1742: PetscErrorCode ierr;
1743: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1744: const PetscInt *idx,*ii;
1745: PetscBool usecprow=a->compressedrow.use;
1748: VecGetArrayRead(xx,&x);
1749: VecGetArrayPair(yy,zz,&y,&z);
1751: idx = a->j;
1752: v = a->a;
1753: if (usecprow) {
1754: if (zz != yy) {
1755: PetscArraycpy(z,y,mbs);
1756: }
1757: mbs = a->compressedrow.nrows;
1758: ii = a->compressedrow.i;
1759: ridx = a->compressedrow.rindex;
1760: } else {
1761: ii = a->i;
1762: }
1764: for (i=0; i<mbs; i++) {
1765: n = ii[1] - ii[0];
1766: ii++;
1767: if (!usecprow) {
1768: sum = y[i];
1769: } else {
1770: sum = y[ridx[i]];
1771: }
1772: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1773: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1774: PetscSparseDensePlusDot(sum,x,v,idx,n);
1775: v += n;
1776: idx += n;
1777: if (usecprow) {
1778: z[ridx[i]] = sum;
1779: } else {
1780: z[i] = sum;
1781: }
1782: }
1783: VecRestoreArrayRead(xx,&x);
1784: VecRestoreArrayPair(yy,zz,&y,&z);
1785: PetscLogFlops(2.0*a->nz);
1786: return(0);
1787: }
1789: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1790: {
1791: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1792: PetscScalar *y = NULL,*z = NULL,sum1,sum2;
1793: const PetscScalar *x,*xb;
1794: PetscScalar x1,x2,*yarray,*zarray;
1795: const MatScalar *v;
1796: PetscErrorCode ierr;
1797: PetscInt mbs = a->mbs,i,n,j;
1798: const PetscInt *idx,*ii,*ridx = NULL;
1799: PetscBool usecprow = a->compressedrow.use;
1802: VecGetArrayRead(xx,&x);
1803: VecGetArrayPair(yy,zz,&yarray,&zarray);
1805: idx = a->j;
1806: v = a->a;
1807: if (usecprow) {
1808: if (zz != yy) {
1809: PetscArraycpy(zarray,yarray,2*mbs);
1810: }
1811: mbs = a->compressedrow.nrows;
1812: ii = a->compressedrow.i;
1813: ridx = a->compressedrow.rindex;
1814: } else {
1815: ii = a->i;
1816: y = yarray;
1817: z = zarray;
1818: }
1820: for (i=0; i<mbs; i++) {
1821: n = ii[1] - ii[0]; ii++;
1822: if (usecprow) {
1823: z = zarray + 2*ridx[i];
1824: y = yarray + 2*ridx[i];
1825: }
1826: sum1 = y[0]; sum2 = y[1];
1827: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1828: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1829: for (j=0; j<n; j++) {
1830: xb = x + 2*(*idx++);
1831: x1 = xb[0];
1832: x2 = xb[1];
1834: sum1 += v[0]*x1 + v[2]*x2;
1835: sum2 += v[1]*x1 + v[3]*x2;
1836: v += 4;
1837: }
1838: z[0] = sum1; z[1] = sum2;
1839: if (!usecprow) {
1840: z += 2; y += 2;
1841: }
1842: }
1843: VecRestoreArrayRead(xx,&x);
1844: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1845: PetscLogFlops(4.0*a->nz);
1846: return(0);
1847: }
1849: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1850: {
1851: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1852: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1853: const PetscScalar *x,*xb;
1854: const MatScalar *v;
1855: PetscErrorCode ierr;
1856: PetscInt mbs = a->mbs,i,j,n;
1857: const PetscInt *idx,*ii,*ridx = NULL;
1858: PetscBool usecprow = a->compressedrow.use;
1861: VecGetArrayRead(xx,&x);
1862: VecGetArrayPair(yy,zz,&yarray,&zarray);
1864: idx = a->j;
1865: v = a->a;
1866: if (usecprow) {
1867: if (zz != yy) {
1868: PetscArraycpy(zarray,yarray,3*mbs);
1869: }
1870: mbs = a->compressedrow.nrows;
1871: ii = a->compressedrow.i;
1872: ridx = a->compressedrow.rindex;
1873: } else {
1874: ii = a->i;
1875: y = yarray;
1876: z = zarray;
1877: }
1879: for (i=0; i<mbs; i++) {
1880: n = ii[1] - ii[0]; ii++;
1881: if (usecprow) {
1882: z = zarray + 3*ridx[i];
1883: y = yarray + 3*ridx[i];
1884: }
1885: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1886: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1887: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1888: for (j=0; j<n; j++) {
1889: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1890: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1891: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1892: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1893: v += 9;
1894: }
1895: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1896: if (!usecprow) {
1897: z += 3; y += 3;
1898: }
1899: }
1900: VecRestoreArrayRead(xx,&x);
1901: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1902: PetscLogFlops(18.0*a->nz);
1903: return(0);
1904: }
1906: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1907: {
1908: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1909: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1910: const PetscScalar *x,*xb;
1911: const MatScalar *v;
1912: PetscErrorCode ierr;
1913: PetscInt mbs = a->mbs,i,j,n;
1914: const PetscInt *idx,*ii,*ridx=NULL;
1915: PetscBool usecprow=a->compressedrow.use;
1918: VecGetArrayRead(xx,&x);
1919: VecGetArrayPair(yy,zz,&yarray,&zarray);
1921: idx = a->j;
1922: v = a->a;
1923: if (usecprow) {
1924: if (zz != yy) {
1925: PetscArraycpy(zarray,yarray,4*mbs);
1926: }
1927: mbs = a->compressedrow.nrows;
1928: ii = a->compressedrow.i;
1929: ridx = a->compressedrow.rindex;
1930: } else {
1931: ii = a->i;
1932: y = yarray;
1933: z = zarray;
1934: }
1936: for (i=0; i<mbs; i++) {
1937: n = ii[1] - ii[0]; ii++;
1938: if (usecprow) {
1939: z = zarray + 4*ridx[i];
1940: y = yarray + 4*ridx[i];
1941: }
1942: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1943: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1944: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1945: for (j=0; j<n; j++) {
1946: xb = x + 4*(*idx++);
1947: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1948: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1949: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1950: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1951: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1952: v += 16;
1953: }
1954: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1955: if (!usecprow) {
1956: z += 4; y += 4;
1957: }
1958: }
1959: VecRestoreArrayRead(xx,&x);
1960: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1961: PetscLogFlops(32.0*a->nz);
1962: return(0);
1963: }
1965: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1966: {
1967: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1968: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1969: const PetscScalar *x,*xb;
1970: PetscScalar *yarray,*zarray;
1971: const MatScalar *v;
1972: PetscErrorCode ierr;
1973: PetscInt mbs = a->mbs,i,j,n;
1974: const PetscInt *idx,*ii,*ridx = NULL;
1975: PetscBool usecprow=a->compressedrow.use;
1978: VecGetArrayRead(xx,&x);
1979: VecGetArrayPair(yy,zz,&yarray,&zarray);
1981: idx = a->j;
1982: v = a->a;
1983: if (usecprow) {
1984: if (zz != yy) {
1985: PetscArraycpy(zarray,yarray,5*mbs);
1986: }
1987: mbs = a->compressedrow.nrows;
1988: ii = a->compressedrow.i;
1989: ridx = a->compressedrow.rindex;
1990: } else {
1991: ii = a->i;
1992: y = yarray;
1993: z = zarray;
1994: }
1996: for (i=0; i<mbs; i++) {
1997: n = ii[1] - ii[0]; ii++;
1998: if (usecprow) {
1999: z = zarray + 5*ridx[i];
2000: y = yarray + 5*ridx[i];
2001: }
2002: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2003: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2004: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2005: for (j=0; j<n; j++) {
2006: xb = x + 5*(*idx++);
2007: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2008: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2009: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2010: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2011: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2012: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2013: v += 25;
2014: }
2015: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2016: if (!usecprow) {
2017: z += 5; y += 5;
2018: }
2019: }
2020: VecRestoreArrayRead(xx,&x);
2021: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2022: PetscLogFlops(50.0*a->nz);
2023: return(0);
2024: }
2026: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2027: {
2028: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2029: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2030: const PetscScalar *x,*xb;
2031: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2032: const MatScalar *v;
2033: PetscErrorCode ierr;
2034: PetscInt mbs = a->mbs,i,j,n;
2035: const PetscInt *idx,*ii,*ridx=NULL;
2036: PetscBool usecprow=a->compressedrow.use;
2039: VecGetArrayRead(xx,&x);
2040: VecGetArrayPair(yy,zz,&yarray,&zarray);
2042: idx = a->j;
2043: v = a->a;
2044: if (usecprow) {
2045: if (zz != yy) {
2046: PetscArraycpy(zarray,yarray,6*mbs);
2047: }
2048: mbs = a->compressedrow.nrows;
2049: ii = a->compressedrow.i;
2050: ridx = a->compressedrow.rindex;
2051: } else {
2052: ii = a->i;
2053: y = yarray;
2054: z = zarray;
2055: }
2057: for (i=0; i<mbs; i++) {
2058: n = ii[1] - ii[0]; ii++;
2059: if (usecprow) {
2060: z = zarray + 6*ridx[i];
2061: y = yarray + 6*ridx[i];
2062: }
2063: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2064: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2065: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2066: for (j=0; j<n; j++) {
2067: xb = x + 6*(*idx++);
2068: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2069: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2070: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2071: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2072: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2073: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2074: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2075: v += 36;
2076: }
2077: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2078: if (!usecprow) {
2079: z += 6; y += 6;
2080: }
2081: }
2082: VecRestoreArrayRead(xx,&x);
2083: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2084: PetscLogFlops(72.0*a->nz);
2085: return(0);
2086: }
2088: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2089: {
2090: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2091: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2092: const PetscScalar *x,*xb;
2093: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2094: const MatScalar *v;
2095: PetscErrorCode ierr;
2096: PetscInt mbs = a->mbs,i,j,n;
2097: const PetscInt *idx,*ii,*ridx = NULL;
2098: PetscBool usecprow=a->compressedrow.use;
2101: VecGetArrayRead(xx,&x);
2102: VecGetArrayPair(yy,zz,&yarray,&zarray);
2104: idx = a->j;
2105: v = a->a;
2106: if (usecprow) {
2107: if (zz != yy) {
2108: PetscArraycpy(zarray,yarray,7*mbs);
2109: }
2110: mbs = a->compressedrow.nrows;
2111: ii = a->compressedrow.i;
2112: ridx = a->compressedrow.rindex;
2113: } else {
2114: ii = a->i;
2115: y = yarray;
2116: z = zarray;
2117: }
2119: for (i=0; i<mbs; i++) {
2120: n = ii[1] - ii[0]; ii++;
2121: if (usecprow) {
2122: z = zarray + 7*ridx[i];
2123: y = yarray + 7*ridx[i];
2124: }
2125: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2126: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2127: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2128: for (j=0; j<n; j++) {
2129: xb = x + 7*(*idx++);
2130: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2131: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2132: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2133: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2134: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2135: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2136: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2137: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2138: v += 49;
2139: }
2140: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2141: if (!usecprow) {
2142: z += 7; y += 7;
2143: }
2144: }
2145: VecRestoreArrayRead(xx,&x);
2146: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2147: PetscLogFlops(98.0*a->nz);
2148: return(0);
2149: }
2151: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2152: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2153: {
2154: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2155: PetscScalar *z = NULL,*work,*workt,*zarray;
2156: const PetscScalar *x,*xb;
2157: const MatScalar *v;
2158: PetscErrorCode ierr;
2159: PetscInt mbs,i,j,n;
2160: PetscInt k;
2161: PetscBool usecprow=a->compressedrow.use;
2162: const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2164: __m256d a0,a1,a2,a3,a4,a5;
2165: __m256d w0,w1,w2,w3;
2166: __m256d z0,z1,z2;
2167: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2170: VecCopy(yy,zz);
2171: VecGetArrayRead(xx,&x);
2172: VecGetArray(zz,&zarray);
2174: idx = a->j;
2175: v = a->a;
2176: if (usecprow) {
2177: mbs = a->compressedrow.nrows;
2178: ii = a->compressedrow.i;
2179: ridx = a->compressedrow.rindex;
2180: } else {
2181: mbs = a->mbs;
2182: ii = a->i;
2183: z = zarray;
2184: }
2186: if (!a->mult_work) {
2187: k = PetscMax(A->rmap->n,A->cmap->n);
2188: PetscMalloc1(k+1,&a->mult_work);
2189: }
2191: work = a->mult_work;
2192: for (i=0; i<mbs; i++) {
2193: n = ii[1] - ii[0]; ii++;
2194: workt = work;
2195: for (j=0; j<n; j++) {
2196: xb = x + bs*(*idx++);
2197: for (k=0; k<bs; k++) workt[k] = xb[k];
2198: workt += bs;
2199: }
2200: if (usecprow) z = zarray + bs*ridx[i];
2202: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2204: for (j=0; j<n; j++) {
2205: /* first column of a */
2206: w0 = _mm256_set1_pd(work[j*9 ]);
2207: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2208: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2209: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2211: /* second column of a */
2212: w1 = _mm256_set1_pd(work[j*9+ 1]);
2213: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2214: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2215: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2217: /* third column of a */
2218: w2 = _mm256_set1_pd(work[j*9+ 2]);
2219: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2220: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2221: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2223: /* fourth column of a */
2224: w3 = _mm256_set1_pd(work[j*9+ 3]);
2225: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2226: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2227: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2229: /* fifth column of a */
2230: w0 = _mm256_set1_pd(work[j*9+ 4]);
2231: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2232: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2233: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2235: /* sixth column of a */
2236: w1 = _mm256_set1_pd(work[j*9+ 5]);
2237: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2238: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2239: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2241: /* seventh column of a */
2242: w2 = _mm256_set1_pd(work[j*9+ 6]);
2243: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2244: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2245: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2247: /* eigth column of a */
2248: w3 = _mm256_set1_pd(work[j*9+ 7]);
2249: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2250: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2251: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2253: /* ninth column of a */
2254: w0 = _mm256_set1_pd(work[j*9+ 8]);
2255: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2256: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2257: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2258: }
2260: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2262: v += n*bs2;
2263: if (!usecprow) z += bs;
2264: }
2265: VecRestoreArrayRead(xx,&x);
2266: VecRestoreArray(zz,&zarray);
2267: PetscLogFlops(162.0*a->nz);
2268: return(0);
2269: }
2270: #endif
2272: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2273: {
2274: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2275: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2276: const PetscScalar *x,*xb;
2277: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2278: const MatScalar *v;
2279: PetscErrorCode ierr;
2280: PetscInt mbs = a->mbs,i,j,n;
2281: const PetscInt *idx,*ii,*ridx = NULL;
2282: PetscBool usecprow=a->compressedrow.use;
2285: VecGetArrayRead(xx,&x);
2286: VecGetArrayPair(yy,zz,&yarray,&zarray);
2288: idx = a->j;
2289: v = a->a;
2290: if (usecprow) {
2291: if (zz != yy) {
2292: PetscArraycpy(zarray,yarray,7*mbs);
2293: }
2294: mbs = a->compressedrow.nrows;
2295: ii = a->compressedrow.i;
2296: ridx = a->compressedrow.rindex;
2297: } else {
2298: ii = a->i;
2299: y = yarray;
2300: z = zarray;
2301: }
2303: for (i=0; i<mbs; i++) {
2304: n = ii[1] - ii[0]; ii++;
2305: if (usecprow) {
2306: z = zarray + 11*ridx[i];
2307: y = yarray + 11*ridx[i];
2308: }
2309: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2310: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2311: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2312: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2313: for (j=0; j<n; j++) {
2314: xb = x + 11*(*idx++);
2315: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2316: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
2317: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
2318: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
2319: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
2320: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
2321: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
2322: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
2323: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
2324: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
2325: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
2326: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
2327: v += 121;
2328: }
2329: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2330: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2331: if (!usecprow) {
2332: z += 11; y += 11;
2333: }
2334: }
2335: VecRestoreArrayRead(xx,&x);
2336: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2337: PetscLogFlops(242.0*a->nz);
2338: return(0);
2339: }
2341: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2342: {
2343: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2344: PetscScalar *z = NULL,*work,*workt,*zarray;
2345: const PetscScalar *x,*xb;
2346: const MatScalar *v;
2347: PetscErrorCode ierr;
2348: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2349: PetscInt ncols,k;
2350: const PetscInt *ridx = NULL,*idx,*ii;
2351: PetscBool usecprow = a->compressedrow.use;
2354: VecCopy(yy,zz);
2355: VecGetArrayRead(xx,&x);
2356: VecGetArray(zz,&zarray);
2358: idx = a->j;
2359: v = a->a;
2360: if (usecprow) {
2361: mbs = a->compressedrow.nrows;
2362: ii = a->compressedrow.i;
2363: ridx = a->compressedrow.rindex;
2364: } else {
2365: mbs = a->mbs;
2366: ii = a->i;
2367: z = zarray;
2368: }
2370: if (!a->mult_work) {
2371: k = PetscMax(A->rmap->n,A->cmap->n);
2372: PetscMalloc1(k+1,&a->mult_work);
2373: }
2374: work = a->mult_work;
2375: for (i=0; i<mbs; i++) {
2376: n = ii[1] - ii[0]; ii++;
2377: ncols = n*bs;
2378: workt = work;
2379: for (j=0; j<n; j++) {
2380: xb = x + bs*(*idx++);
2381: for (k=0; k<bs; k++) workt[k] = xb[k];
2382: workt += bs;
2383: }
2384: if (usecprow) z = zarray + bs*ridx[i];
2385: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2386: v += n*bs2;
2387: if (!usecprow) z += bs;
2388: }
2389: VecRestoreArrayRead(xx,&x);
2390: VecRestoreArray(zz,&zarray);
2391: PetscLogFlops(2.0*a->nz*bs2);
2392: return(0);
2393: }
2395: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2396: {
2397: PetscScalar zero = 0.0;
2401: VecSet(zz,zero);
2402: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2403: return(0);
2404: }
2406: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2407: {
2408: PetscScalar zero = 0.0;
2412: VecSet(zz,zero);
2413: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2414: return(0);
2415: }
2417: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2418: {
2419: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2420: PetscScalar *z,x1,x2,x3,x4,x5;
2421: const PetscScalar *x,*xb = NULL;
2422: const MatScalar *v;
2423: PetscErrorCode ierr;
2424: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
2425: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2426: Mat_CompressedRow cprow = a->compressedrow;
2427: PetscBool usecprow = cprow.use;
2430: if (yy != zz) { VecCopy(yy,zz); }
2431: VecGetArrayRead(xx,&x);
2432: VecGetArray(zz,&z);
2434: idx = a->j;
2435: v = a->a;
2436: if (usecprow) {
2437: mbs = cprow.nrows;
2438: ii = cprow.i;
2439: ridx = cprow.rindex;
2440: } else {
2441: mbs=a->mbs;
2442: ii = a->i;
2443: xb = x;
2444: }
2446: switch (bs) {
2447: case 1:
2448: for (i=0; i<mbs; i++) {
2449: if (usecprow) xb = x + ridx[i];
2450: x1 = xb[0];
2451: ib = idx + ii[0];
2452: n = ii[1] - ii[0]; ii++;
2453: for (j=0; j<n; j++) {
2454: rval = ib[j];
2455: z[rval] += PetscConj(*v) * x1;
2456: v++;
2457: }
2458: if (!usecprow) xb++;
2459: }
2460: break;
2461: case 2:
2462: for (i=0; i<mbs; i++) {
2463: if (usecprow) xb = x + 2*ridx[i];
2464: x1 = xb[0]; x2 = xb[1];
2465: ib = idx + ii[0];
2466: n = ii[1] - ii[0]; ii++;
2467: for (j=0; j<n; j++) {
2468: rval = ib[j]*2;
2469: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2470: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2471: v += 4;
2472: }
2473: if (!usecprow) xb += 2;
2474: }
2475: break;
2476: case 3:
2477: for (i=0; i<mbs; i++) {
2478: if (usecprow) xb = x + 3*ridx[i];
2479: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2480: ib = idx + ii[0];
2481: n = ii[1] - ii[0]; ii++;
2482: for (j=0; j<n; j++) {
2483: rval = ib[j]*3;
2484: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2485: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2486: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2487: v += 9;
2488: }
2489: if (!usecprow) xb += 3;
2490: }
2491: break;
2492: case 4:
2493: for (i=0; i<mbs; i++) {
2494: if (usecprow) xb = x + 4*ridx[i];
2495: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2496: ib = idx + ii[0];
2497: n = ii[1] - ii[0]; ii++;
2498: for (j=0; j<n; j++) {
2499: rval = ib[j]*4;
2500: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2501: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2502: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2503: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2504: v += 16;
2505: }
2506: if (!usecprow) xb += 4;
2507: }
2508: break;
2509: case 5:
2510: for (i=0; i<mbs; i++) {
2511: if (usecprow) xb = x + 5*ridx[i];
2512: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2513: x4 = xb[3]; x5 = xb[4];
2514: ib = idx + ii[0];
2515: n = ii[1] - ii[0]; ii++;
2516: for (j=0; j<n; j++) {
2517: rval = ib[j]*5;
2518: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2519: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2520: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2521: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2522: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2523: v += 25;
2524: }
2525: if (!usecprow) xb += 5;
2526: }
2527: break;
2528: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2529: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2530: #if 0
2531: {
2532: PetscInt ncols,k,bs2=a->bs2;
2533: PetscScalar *work,*workt,zb;
2534: const PetscScalar *xtmp;
2535: if (!a->mult_work) {
2536: k = PetscMax(A->rmap->n,A->cmap->n);
2537: PetscMalloc1(k+1,&a->mult_work);
2538: }
2539: work = a->mult_work;
2540: xtmp = x;
2541: for (i=0; i<mbs; i++) {
2542: n = ii[1] - ii[0]; ii++;
2543: ncols = n*bs;
2544: PetscArrayzero(work,ncols);
2545: if (usecprow) xtmp = x + bs*ridx[i];
2546: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2547: v += n*bs2;
2548: if (!usecprow) xtmp += bs;
2549: workt = work;
2550: for (j=0; j<n; j++) {
2551: zb = z + bs*(*idx++);
2552: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2553: workt += bs;
2554: }
2555: }
2556: }
2557: #endif
2558: }
2559: VecRestoreArrayRead(xx,&x);
2560: VecRestoreArray(zz,&z);
2561: PetscLogFlops(2.0*a->nz*a->bs2);
2562: return(0);
2563: }
2565: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2566: {
2567: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2568: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2569: const PetscScalar *x,*xb = NULL;
2570: const MatScalar *v;
2571: PetscErrorCode ierr;
2572: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2573: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2574: Mat_CompressedRow cprow = a->compressedrow;
2575: PetscBool usecprow=cprow.use;
2578: if (yy != zz) { VecCopy(yy,zz); }
2579: VecGetArrayRead(xx,&x);
2580: VecGetArray(zz,&z);
2582: idx = a->j;
2583: v = a->a;
2584: if (usecprow) {
2585: mbs = cprow.nrows;
2586: ii = cprow.i;
2587: ridx = cprow.rindex;
2588: } else {
2589: mbs=a->mbs;
2590: ii = a->i;
2591: xb = x;
2592: }
2594: switch (bs) {
2595: case 1:
2596: for (i=0; i<mbs; i++) {
2597: if (usecprow) xb = x + ridx[i];
2598: x1 = xb[0];
2599: ib = idx + ii[0];
2600: n = ii[1] - ii[0]; ii++;
2601: for (j=0; j<n; j++) {
2602: rval = ib[j];
2603: z[rval] += *v * x1;
2604: v++;
2605: }
2606: if (!usecprow) xb++;
2607: }
2608: break;
2609: case 2:
2610: for (i=0; i<mbs; i++) {
2611: if (usecprow) xb = x + 2*ridx[i];
2612: x1 = xb[0]; x2 = xb[1];
2613: ib = idx + ii[0];
2614: n = ii[1] - ii[0]; ii++;
2615: for (j=0; j<n; j++) {
2616: rval = ib[j]*2;
2617: z[rval++] += v[0]*x1 + v[1]*x2;
2618: z[rval++] += v[2]*x1 + v[3]*x2;
2619: v += 4;
2620: }
2621: if (!usecprow) xb += 2;
2622: }
2623: break;
2624: case 3:
2625: for (i=0; i<mbs; i++) {
2626: if (usecprow) xb = x + 3*ridx[i];
2627: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2628: ib = idx + ii[0];
2629: n = ii[1] - ii[0]; ii++;
2630: for (j=0; j<n; j++) {
2631: rval = ib[j]*3;
2632: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2633: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2634: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2635: v += 9;
2636: }
2637: if (!usecprow) xb += 3;
2638: }
2639: break;
2640: case 4:
2641: for (i=0; i<mbs; i++) {
2642: if (usecprow) xb = x + 4*ridx[i];
2643: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2644: ib = idx + ii[0];
2645: n = ii[1] - ii[0]; ii++;
2646: for (j=0; j<n; j++) {
2647: rval = ib[j]*4;
2648: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2649: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2650: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2651: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2652: v += 16;
2653: }
2654: if (!usecprow) xb += 4;
2655: }
2656: break;
2657: case 5:
2658: for (i=0; i<mbs; i++) {
2659: if (usecprow) xb = x + 5*ridx[i];
2660: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2661: x4 = xb[3]; x5 = xb[4];
2662: ib = idx + ii[0];
2663: n = ii[1] - ii[0]; ii++;
2664: for (j=0; j<n; j++) {
2665: rval = ib[j]*5;
2666: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2667: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2668: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2669: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2670: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2671: v += 25;
2672: }
2673: if (!usecprow) xb += 5;
2674: }
2675: break;
2676: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2677: PetscInt ncols,k;
2678: PetscScalar *work,*workt;
2679: const PetscScalar *xtmp;
2680: if (!a->mult_work) {
2681: k = PetscMax(A->rmap->n,A->cmap->n);
2682: PetscMalloc1(k+1,&a->mult_work);
2683: }
2684: work = a->mult_work;
2685: xtmp = x;
2686: for (i=0; i<mbs; i++) {
2687: n = ii[1] - ii[0]; ii++;
2688: ncols = n*bs;
2689: PetscArrayzero(work,ncols);
2690: if (usecprow) xtmp = x + bs*ridx[i];
2691: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2692: v += n*bs2;
2693: if (!usecprow) xtmp += bs;
2694: workt = work;
2695: for (j=0; j<n; j++) {
2696: zb = z + bs*(*idx++);
2697: for (k=0; k<bs; k++) zb[k] += workt[k];
2698: workt += bs;
2699: }
2700: }
2701: }
2702: }
2703: VecRestoreArrayRead(xx,&x);
2704: VecRestoreArray(zz,&z);
2705: PetscLogFlops(2.0*a->nz*a->bs2);
2706: return(0);
2707: }
2709: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2710: {
2711: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2712: PetscInt totalnz = a->bs2*a->nz;
2713: PetscScalar oalpha = alpha;
2715: PetscBLASInt one = 1,tnz;
2718: PetscBLASIntCast(totalnz,&tnz);
2719: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2720: PetscLogFlops(totalnz);
2721: return(0);
2722: }
2724: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2725: {
2727: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2728: MatScalar *v = a->a;
2729: PetscReal sum = 0.0;
2730: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2733: if (type == NORM_FROBENIUS) {
2734: #if defined(PETSC_USE_REAL___FP16)
2735: PetscBLASInt one = 1,cnt = bs2*nz;
2736: PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2737: #else
2738: for (i=0; i<bs2*nz; i++) {
2739: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2740: }
2741: #endif
2742: *norm = PetscSqrtReal(sum);
2743: PetscLogFlops(2.0*bs2*nz);
2744: } else if (type == NORM_1) { /* maximum column sum */
2745: PetscReal *tmp;
2746: PetscInt *bcol = a->j;
2747: PetscCalloc1(A->cmap->n+1,&tmp);
2748: for (i=0; i<nz; i++) {
2749: for (j=0; j<bs; j++) {
2750: k1 = bs*(*bcol) + j; /* column index */
2751: for (k=0; k<bs; k++) {
2752: tmp[k1] += PetscAbsScalar(*v); v++;
2753: }
2754: }
2755: bcol++;
2756: }
2757: *norm = 0.0;
2758: for (j=0; j<A->cmap->n; j++) {
2759: if (tmp[j] > *norm) *norm = tmp[j];
2760: }
2761: PetscFree(tmp);
2762: PetscLogFlops(PetscMax(bs2*nz-1,0));
2763: } else if (type == NORM_INFINITY) { /* maximum row sum */
2764: *norm = 0.0;
2765: for (k=0; k<bs; k++) {
2766: for (j=0; j<a->mbs; j++) {
2767: v = a->a + bs2*a->i[j] + k;
2768: sum = 0.0;
2769: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2770: for (k1=0; k1<bs; k1++) {
2771: sum += PetscAbsScalar(*v);
2772: v += bs;
2773: }
2774: }
2775: if (sum > *norm) *norm = sum;
2776: }
2777: }
2778: PetscLogFlops(PetscMax(bs2*nz-1,0));
2779: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2780: return(0);
2781: }
2783: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2784: {
2785: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2789: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2790: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2791: *flg = PETSC_FALSE;
2792: return(0);
2793: }
2795: /* if the a->i are the same */
2796: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2797: if (!*flg) return(0);
2799: /* if a->j are the same */
2800: PetscArraycmp(a->j,b->j,a->nz,flg);
2801: if (!*flg) return(0);
2803: /* if a->a are the same */
2804: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2805: return(0);
2807: }
2809: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2810: {
2811: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2813: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2814: PetscScalar *x,zero = 0.0;
2815: MatScalar *aa,*aa_j;
2818: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2819: bs = A->rmap->bs;
2820: aa = a->a;
2821: ai = a->i;
2822: aj = a->j;
2823: ambs = a->mbs;
2824: bs2 = a->bs2;
2826: VecSet(v,zero);
2827: VecGetArray(v,&x);
2828: VecGetLocalSize(v,&n);
2829: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2830: for (i=0; i<ambs; i++) {
2831: for (j=ai[i]; j<ai[i+1]; j++) {
2832: if (aj[j] == i) {
2833: row = i*bs;
2834: aa_j = aa+j*bs2;
2835: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2836: break;
2837: }
2838: }
2839: }
2840: VecRestoreArray(v,&x);
2841: return(0);
2842: }
2844: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2845: {
2846: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2847: const PetscScalar *l,*r,*li,*ri;
2848: PetscScalar x;
2849: MatScalar *aa, *v;
2850: PetscErrorCode ierr;
2851: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2852: const PetscInt *ai,*aj;
2855: ai = a->i;
2856: aj = a->j;
2857: aa = a->a;
2858: m = A->rmap->n;
2859: n = A->cmap->n;
2860: bs = A->rmap->bs;
2861: mbs = a->mbs;
2862: bs2 = a->bs2;
2863: if (ll) {
2864: VecGetArrayRead(ll,&l);
2865: VecGetLocalSize(ll,&lm);
2866: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2867: for (i=0; i<mbs; i++) { /* for each block row */
2868: M = ai[i+1] - ai[i];
2869: li = l + i*bs;
2870: v = aa + bs2*ai[i];
2871: for (j=0; j<M; j++) { /* for each block */
2872: for (k=0; k<bs2; k++) {
2873: (*v++) *= li[k%bs];
2874: }
2875: }
2876: }
2877: VecRestoreArrayRead(ll,&l);
2878: PetscLogFlops(a->nz);
2879: }
2881: if (rr) {
2882: VecGetArrayRead(rr,&r);
2883: VecGetLocalSize(rr,&rn);
2884: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2885: for (i=0; i<mbs; i++) { /* for each block row */
2886: iai = ai[i];
2887: M = ai[i+1] - iai;
2888: v = aa + bs2*iai;
2889: for (j=0; j<M; j++) { /* for each block */
2890: ri = r + bs*aj[iai+j];
2891: for (k=0; k<bs; k++) {
2892: x = ri[k];
2893: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2894: v += bs;
2895: }
2896: }
2897: }
2898: VecRestoreArrayRead(rr,&r);
2899: PetscLogFlops(a->nz);
2900: }
2901: return(0);
2902: }
2904: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2905: {
2906: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2909: info->block_size = a->bs2;
2910: info->nz_allocated = a->bs2*a->maxnz;
2911: info->nz_used = a->bs2*a->nz;
2912: info->nz_unneeded = info->nz_allocated - info->nz_used;
2913: info->assemblies = A->num_ass;
2914: info->mallocs = A->info.mallocs;
2915: info->memory = ((PetscObject)A)->mem;
2916: if (A->factortype) {
2917: info->fill_ratio_given = A->info.fill_ratio_given;
2918: info->fill_ratio_needed = A->info.fill_ratio_needed;
2919: info->factor_mallocs = A->info.factor_mallocs;
2920: } else {
2921: info->fill_ratio_given = 0;
2922: info->fill_ratio_needed = 0;
2923: info->factor_mallocs = 0;
2924: }
2925: return(0);
2926: }
2928: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2929: {
2930: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2934: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2935: return(0);
2936: }
2938: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2939: {
2943: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2944: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2945: return(0);
2946: }
2948: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2949: {
2950: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2951: PetscScalar *z = NULL,sum1;
2952: const PetscScalar *xb;
2953: PetscScalar x1;
2954: const MatScalar *v,*vv;
2955: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2956: PetscBool usecprow=a->compressedrow.use;
2959: idx = a->j;
2960: v = a->a;
2961: if (usecprow) {
2962: mbs = a->compressedrow.nrows;
2963: ii = a->compressedrow.i;
2964: ridx = a->compressedrow.rindex;
2965: } else {
2966: mbs = a->mbs;
2967: ii = a->i;
2968: z = c;
2969: }
2971: for (i=0; i<mbs; i++) {
2972: n = ii[1] - ii[0]; ii++;
2973: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2974: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2975: if (usecprow) z = c + ridx[i];
2976: jj = idx;
2977: vv = v;
2978: for (k=0; k<cn; k++) {
2979: idx = jj;
2980: v = vv;
2981: sum1 = 0.0;
2982: for (j=0; j<n; j++) {
2983: xb = b + (*idx++); x1 = xb[0+k*bm];
2984: sum1 += v[0]*x1;
2985: v += 1;
2986: }
2987: z[0+k*cm] = sum1;
2988: }
2989: if (!usecprow) z += 1;
2990: }
2991: return(0);
2992: }
2994: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2995: {
2996: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2997: PetscScalar *z = NULL,sum1,sum2;
2998: const PetscScalar *xb;
2999: PetscScalar x1,x2;
3000: const MatScalar *v,*vv;
3001: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3002: PetscBool usecprow=a->compressedrow.use;
3005: idx = a->j;
3006: v = a->a;
3007: if (usecprow) {
3008: mbs = a->compressedrow.nrows;
3009: ii = a->compressedrow.i;
3010: ridx = a->compressedrow.rindex;
3011: } else {
3012: mbs = a->mbs;
3013: ii = a->i;
3014: z = c;
3015: }
3017: for (i=0; i<mbs; i++) {
3018: n = ii[1] - ii[0]; ii++;
3019: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3020: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3021: if (usecprow) z = c + 2*ridx[i];
3022: jj = idx;
3023: vv = v;
3024: for (k=0; k<cn; k++) {
3025: idx = jj;
3026: v = vv;
3027: sum1 = 0.0; sum2 = 0.0;
3028: for (j=0; j<n; j++) {
3029: xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3030: sum1 += v[0]*x1 + v[2]*x2;
3031: sum2 += v[1]*x1 + v[3]*x2;
3032: v += 4;
3033: }
3034: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3035: }
3036: if (!usecprow) z += 2;
3037: }
3038: return(0);
3039: }
3041: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3042: {
3043: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3044: PetscScalar *z = NULL,sum1,sum2,sum3;
3045: const PetscScalar *xb;
3046: PetscScalar x1,x2,x3;
3047: const MatScalar *v,*vv;
3048: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3049: PetscBool usecprow=a->compressedrow.use;
3052: idx = a->j;
3053: v = a->a;
3054: if (usecprow) {
3055: mbs = a->compressedrow.nrows;
3056: ii = a->compressedrow.i;
3057: ridx = a->compressedrow.rindex;
3058: } else {
3059: mbs = a->mbs;
3060: ii = a->i;
3061: z = c;
3062: }
3064: for (i=0; i<mbs; i++) {
3065: n = ii[1] - ii[0]; ii++;
3066: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3067: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3068: if (usecprow) z = c + 3*ridx[i];
3069: jj = idx;
3070: vv = v;
3071: for (k=0; k<cn; k++) {
3072: idx = jj;
3073: v = vv;
3074: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
3075: for (j=0; j<n; j++) {
3076: xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3077: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3078: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3079: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3080: v += 9;
3081: }
3082: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3083: }
3084: if (!usecprow) z += 3;
3085: }
3086: return(0);
3087: }
3089: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3090: {
3091: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3092: PetscScalar *z = NULL,sum1,sum2,sum3,sum4;
3093: const PetscScalar *xb;
3094: PetscScalar x1,x2,x3,x4;
3095: const MatScalar *v,*vv;
3096: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3097: PetscBool usecprow=a->compressedrow.use;
3100: idx = a->j;
3101: v = a->a;
3102: if (usecprow) {
3103: mbs = a->compressedrow.nrows;
3104: ii = a->compressedrow.i;
3105: ridx = a->compressedrow.rindex;
3106: } else {
3107: mbs = a->mbs;
3108: ii = a->i;
3109: z = c;
3110: }
3112: for (i=0; i<mbs; i++) {
3113: n = ii[1] - ii[0]; ii++;
3114: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3115: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3116: if (usecprow) z = c + 4*ridx[i];
3117: jj = idx;
3118: vv = v;
3119: for (k=0; k<cn; k++) {
3120: idx = jj;
3121: v = vv;
3122: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3123: for (j=0; j<n; j++) {
3124: xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3125: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3126: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3127: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3128: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3129: v += 16;
3130: }
3131: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3132: }
3133: if (!usecprow) z += 4;
3134: }
3135: return(0);
3136: }
3138: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3139: {
3140: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3141: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5;
3142: const PetscScalar *xb;
3143: PetscScalar x1,x2,x3,x4,x5;
3144: const MatScalar *v,*vv;
3145: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3146: PetscBool usecprow=a->compressedrow.use;
3149: idx = a->j;
3150: v = a->a;
3151: if (usecprow) {
3152: mbs = a->compressedrow.nrows;
3153: ii = a->compressedrow.i;
3154: ridx = a->compressedrow.rindex;
3155: } else {
3156: mbs = a->mbs;
3157: ii = a->i;
3158: z = c;
3159: }
3161: for (i=0; i<mbs; i++) {
3162: n = ii[1] - ii[0]; ii++;
3163: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3164: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3165: if (usecprow) z = c + 5*ridx[i];
3166: jj = idx;
3167: vv = v;
3168: for (k=0; k<cn; k++) {
3169: idx = jj;
3170: v = vv;
3171: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3172: for (j=0; j<n; j++) {
3173: xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3174: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3175: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3176: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3177: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3178: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3179: v += 25;
3180: }
3181: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3182: }
3183: if (!usecprow) z += 5;
3184: }
3185: return(0);
3186: }
3188: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3189: {
3190: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3191: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
3192: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
3193: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3194: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3195: PetscBLASInt bbs,bcn,bbm,bcm;
3196: PetscScalar *z = NULL;
3197: PetscScalar *c,*b;
3198: const MatScalar *v;
3199: const PetscInt *idx,*ii,*ridx=NULL;
3200: PetscScalar _DZero=0.0,_DOne=1.0;
3201: PetscBool usecprow=a->compressedrow.use;
3202: PetscErrorCode ierr;
3205: if (!cm || !cn) return(0);
3206: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3207: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3208: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3209: b = bd->v;
3210: if (a->nonzerorowcnt != A->rmap->n) {
3211: MatZeroEntries(C);
3212: }
3213: MatDenseGetArray(C,&c);
3214: switch (bs) {
3215: case 1:
3216: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3217: break;
3218: case 2:
3219: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3220: break;
3221: case 3:
3222: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3223: break;
3224: case 4:
3225: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3226: break;
3227: case 5:
3228: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3229: break;
3230: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3231: PetscBLASIntCast(bs,&bbs);
3232: PetscBLASIntCast(cn,&bcn);
3233: PetscBLASIntCast(bm,&bbm);
3234: PetscBLASIntCast(cm,&bcm);
3235: idx = a->j;
3236: v = a->a;
3237: if (usecprow) {
3238: mbs = a->compressedrow.nrows;
3239: ii = a->compressedrow.i;
3240: ridx = a->compressedrow.rindex;
3241: } else {
3242: mbs = a->mbs;
3243: ii = a->i;
3244: z = c;
3245: }
3246: for (i=0; i<mbs; i++) {
3247: n = ii[1] - ii[0]; ii++;
3248: if (usecprow) z = c + bs*ridx[i];
3249: if (n) {
3250: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3251: v += bs2;
3252: }
3253: for (j=1; j<n; j++) {
3254: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3255: v += bs2;
3256: }
3257: if (!usecprow) z += bs;
3258: }
3259: }
3260: MatDenseRestoreArray(C,&c);
3261: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3262: return(0);
3263: }