Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
15: const PetscInt *idx;
16: PetscInt start,end,*ai,*aj,bs,*nidx2;
17: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
26: PetscBTCreate(m,&table);
27: PetscMalloc1(m+1,&nidx);
28: PetscMalloc1(A->rmap->N+1,&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n; ++j) {
41: ival = idx[j]/bs; /* convert the indices into block indices */
43: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return 0;
71: }
73: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
74: {
75: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
76: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
77: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
78: const PetscInt *irow,*icol;
79: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
80: PetscInt *aj = a->j,*ai = a->i;
81: MatScalar *mat_a;
82: Mat C;
83: PetscBool flag;
85: ISGetIndices(isrow,&irow);
86: ISGetIndices(iscol,&icol);
87: ISGetLocalSize(isrow,&nrows);
88: ISGetLocalSize(iscol,&ncols);
90: PetscCalloc1(1+oldcols,&smap);
91: ssmap = smap;
92: PetscMalloc1(1+nrows,&lens);
93: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94: /* determine lens of each row */
95: for (i=0; i<nrows; i++) {
96: kstart = ai[irow[i]];
97: kend = kstart + a->ilen[irow[i]];
98: lens[i] = 0;
99: for (k=kstart; k<kend; k++) {
100: if (ssmap[aj[k]]) lens[i]++;
101: }
102: }
103: /* Create and fill new matrix */
104: if (scall == MAT_REUSE_MATRIX) {
105: c = (Mat_SeqBAIJ*)((*B)->data);
108: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
110: PetscArrayzero(c->ilen,c->mbs);
111: C = *B;
112: } else {
113: MatCreate(PetscObjectComm((PetscObject)A),&C);
114: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
115: MatSetType(C,((PetscObject)A)->type_name);
116: MatSeqBAIJSetPreallocation(C,bs,0,lens);
117: }
118: c = (Mat_SeqBAIJ*)(C->data);
119: for (i=0; i<nrows; i++) {
120: row = irow[i];
121: kstart = ai[row];
122: kend = kstart + a->ilen[row];
123: mat_i = c->i[i];
124: mat_j = c->j + mat_i;
125: mat_a = c->a + mat_i*bs2;
126: mat_ilen = c->ilen + i;
127: for (k=kstart; k<kend; k++) {
128: if ((tcol=ssmap[a->j[k]])) {
129: *mat_j++ = tcol - 1;
130: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
131: mat_a += bs2;
132: (*mat_ilen)++;
133: }
134: }
135: }
136: /* sort */
137: {
138: MatScalar *work;
139: PetscMalloc1(bs2,&work);
140: for (i=0; i<nrows; i++) {
141: PetscInt ilen;
142: mat_i = c->i[i];
143: mat_j = c->j + mat_i;
144: mat_a = c->a + mat_i*bs2;
145: ilen = c->ilen[i];
146: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
147: }
148: PetscFree(work);
149: }
151: /* Free work space */
152: ISRestoreIndices(iscol,&icol);
153: PetscFree(smap);
154: PetscFree(lens);
155: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
158: ISRestoreIndices(isrow,&irow);
159: *B = C;
160: return 0;
161: }
163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
166: IS is1,is2;
167: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
168: const PetscInt *irow,*icol;
170: ISGetIndices(isrow,&irow);
171: ISGetIndices(iscol,&icol);
172: ISGetLocalSize(isrow,&nrows);
173: ISGetLocalSize(iscol,&ncols);
175: /* Verify if the indices corespond to each element in a block
176: and form the IS with compressed IS */
177: maxmnbs = PetscMax(a->mbs,a->nbs);
178: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
179: PetscArrayzero(vary,a->mbs);
180: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
181: for (i=0; i<a->mbs; i++) {
183: }
184: count = 0;
185: for (i=0; i<nrows; i++) {
186: j = irow[i] / bs;
187: if ((vary[j]--)==bs) iary[count++] = j;
188: }
189: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
191: PetscArrayzero(vary,a->nbs);
192: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
193: for (i=0; i<a->nbs; i++) {
195: }
196: count = 0;
197: for (i=0; i<ncols; i++) {
198: j = icol[i] / bs;
199: if ((vary[j]--)==bs) iary[count++] = j;
200: }
201: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
202: ISRestoreIndices(isrow,&irow);
203: ISRestoreIndices(iscol,&icol);
204: PetscFree2(vary,iary);
206: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
207: ISDestroy(&is1);
208: ISDestroy(&is2);
209: return 0;
210: }
212: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
213: {
214: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
215: Mat_SubSppt *submatj = c->submatis1;
217: (*submatj->destroy)(C);
218: MatDestroySubMatrix_Private(submatj);
219: return 0;
220: }
222: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
223: {
224: PetscInt i;
225: Mat C;
226: Mat_SeqBAIJ *c;
227: Mat_SubSppt *submatj;
229: for (i=0; i<n; i++) {
230: C = (*mat)[i];
231: c = (Mat_SeqBAIJ*)C->data;
232: submatj = c->submatis1;
233: if (submatj) {
234: if (--((PetscObject)C)->refct <= 0) {
235: (*submatj->destroy)(C);
236: MatDestroySubMatrix_Private(submatj);
237: PetscFree(C->defaultvectype);
238: PetscLayoutDestroy(&C->rmap);
239: PetscLayoutDestroy(&C->cmap);
240: PetscHeaderDestroy(&C);
241: }
242: } else {
243: MatDestroy(&C);
244: }
245: }
247: /* Destroy Dummy submatrices created for reuse */
248: MatDestroySubMatrices_Dummy(n,mat);
250: PetscFree(*mat);
251: return 0;
252: }
254: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
255: {
256: PetscInt i;
258: if (scall == MAT_INITIAL_MATRIX) {
259: PetscCalloc1(n+1,B);
260: }
262: for (i=0; i<n; i++) {
263: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
264: }
265: return 0;
266: }
268: /* -------------------------------------------------------*/
269: /* Should check that shapes of vectors and matrices match */
270: /* -------------------------------------------------------*/
272: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
273: {
274: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
275: PetscScalar *z,sum;
276: const PetscScalar *x;
277: const MatScalar *v;
278: PetscInt mbs,i,n;
279: const PetscInt *idx,*ii,*ridx=NULL;
280: PetscBool usecprow=a->compressedrow.use;
282: VecGetArrayRead(xx,&x);
283: VecGetArrayWrite(zz,&z);
285: if (usecprow) {
286: mbs = a->compressedrow.nrows;
287: ii = a->compressedrow.i;
288: ridx = a->compressedrow.rindex;
289: PetscArrayzero(z,a->mbs);
290: } else {
291: mbs = a->mbs;
292: ii = a->i;
293: }
295: for (i=0; i<mbs; i++) {
296: n = ii[1] - ii[0];
297: v = a->a + ii[0];
298: idx = a->j + ii[0];
299: ii++;
300: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
301: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
302: sum = 0.0;
303: PetscSparseDensePlusDot(sum,x,v,idx,n);
304: if (usecprow) {
305: z[ridx[i]] = sum;
306: } else {
307: z[i] = sum;
308: }
309: }
310: VecRestoreArrayRead(xx,&x);
311: VecRestoreArrayWrite(zz,&z);
312: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
313: return 0;
314: }
316: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
317: {
318: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
319: PetscScalar *z = NULL,sum1,sum2,*zarray;
320: const PetscScalar *x,*xb;
321: PetscScalar x1,x2;
322: const MatScalar *v;
323: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
324: PetscBool usecprow=a->compressedrow.use;
326: VecGetArrayRead(xx,&x);
327: VecGetArrayWrite(zz,&zarray);
329: idx = a->j;
330: v = a->a;
331: if (usecprow) {
332: mbs = a->compressedrow.nrows;
333: ii = a->compressedrow.i;
334: ridx = a->compressedrow.rindex;
335: PetscArrayzero(zarray,2*a->mbs);
336: } else {
337: mbs = a->mbs;
338: ii = a->i;
339: z = zarray;
340: }
342: for (i=0; i<mbs; i++) {
343: n = ii[1] - ii[0]; ii++;
344: sum1 = 0.0; sum2 = 0.0;
345: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
346: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
347: for (j=0; j<n; j++) {
348: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
349: sum1 += v[0]*x1 + v[2]*x2;
350: sum2 += v[1]*x1 + v[3]*x2;
351: v += 4;
352: }
353: if (usecprow) z = zarray + 2*ridx[i];
354: z[0] = sum1; z[1] = sum2;
355: if (!usecprow) z += 2;
356: }
357: VecRestoreArrayRead(xx,&x);
358: VecRestoreArrayWrite(zz,&zarray);
359: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
360: return 0;
361: }
363: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
364: {
365: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
366: PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
367: const PetscScalar *x,*xb;
368: const MatScalar *v;
369: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
370: PetscBool usecprow=a->compressedrow.use;
372: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
373: #pragma disjoint(*v,*z,*xb)
374: #endif
376: VecGetArrayRead(xx,&x);
377: VecGetArrayWrite(zz,&zarray);
379: idx = a->j;
380: v = a->a;
381: if (usecprow) {
382: mbs = a->compressedrow.nrows;
383: ii = a->compressedrow.i;
384: ridx = a->compressedrow.rindex;
385: PetscArrayzero(zarray,3*a->mbs);
386: } else {
387: mbs = a->mbs;
388: ii = a->i;
389: z = zarray;
390: }
392: for (i=0; i<mbs; i++) {
393: n = ii[1] - ii[0]; ii++;
394: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
395: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
396: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
397: for (j=0; j<n; j++) {
398: xb = x + 3*(*idx++);
399: x1 = xb[0];
400: x2 = xb[1];
401: x3 = xb[2];
403: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
404: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
405: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
406: v += 9;
407: }
408: if (usecprow) z = zarray + 3*ridx[i];
409: z[0] = sum1; z[1] = sum2; z[2] = sum3;
410: if (!usecprow) z += 3;
411: }
412: VecRestoreArrayRead(xx,&x);
413: VecRestoreArrayWrite(zz,&zarray);
414: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
415: return 0;
416: }
418: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
419: {
420: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
421: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
422: const PetscScalar *x,*xb;
423: const MatScalar *v;
424: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
425: PetscBool usecprow=a->compressedrow.use;
427: VecGetArrayRead(xx,&x);
428: VecGetArrayWrite(zz,&zarray);
430: idx = a->j;
431: v = a->a;
432: if (usecprow) {
433: mbs = a->compressedrow.nrows;
434: ii = a->compressedrow.i;
435: ridx = a->compressedrow.rindex;
436: PetscArrayzero(zarray,4*a->mbs);
437: } else {
438: mbs = a->mbs;
439: ii = a->i;
440: z = zarray;
441: }
443: for (i=0; i<mbs; i++) {
444: n = ii[1] - ii[0];
445: ii++;
446: sum1 = 0.0;
447: sum2 = 0.0;
448: sum3 = 0.0;
449: sum4 = 0.0;
451: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
452: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
453: for (j=0; j<n; j++) {
454: xb = x + 4*(*idx++);
455: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
456: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
457: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
458: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
459: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
460: v += 16;
461: }
462: if (usecprow) z = zarray + 4*ridx[i];
463: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
464: if (!usecprow) z += 4;
465: }
466: VecRestoreArrayRead(xx,&x);
467: VecRestoreArrayWrite(zz,&zarray);
468: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
469: return 0;
470: }
472: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
473: {
474: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
475: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
476: const PetscScalar *xb,*x;
477: const MatScalar *v;
478: const PetscInt *idx,*ii,*ridx=NULL;
479: PetscInt mbs,i,j,n;
480: PetscBool usecprow=a->compressedrow.use;
482: VecGetArrayRead(xx,&x);
483: VecGetArrayWrite(zz,&zarray);
485: idx = a->j;
486: v = a->a;
487: if (usecprow) {
488: mbs = a->compressedrow.nrows;
489: ii = a->compressedrow.i;
490: ridx = a->compressedrow.rindex;
491: PetscArrayzero(zarray,5*a->mbs);
492: } else {
493: mbs = a->mbs;
494: ii = a->i;
495: z = zarray;
496: }
498: for (i=0; i<mbs; i++) {
499: n = ii[1] - ii[0]; ii++;
500: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
501: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
502: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
503: for (j=0; j<n; j++) {
504: xb = x + 5*(*idx++);
505: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
506: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
507: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
508: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
509: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
510: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
511: v += 25;
512: }
513: if (usecprow) z = zarray + 5*ridx[i];
514: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
515: if (!usecprow) z += 5;
516: }
517: VecRestoreArrayRead(xx,&x);
518: VecRestoreArrayWrite(zz,&zarray);
519: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
520: return 0;
521: }
523: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
524: {
525: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
526: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
527: const PetscScalar *x,*xb;
528: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
529: const MatScalar *v;
530: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
531: PetscBool usecprow=a->compressedrow.use;
533: VecGetArrayRead(xx,&x);
534: VecGetArrayWrite(zz,&zarray);
536: idx = a->j;
537: v = a->a;
538: if (usecprow) {
539: mbs = a->compressedrow.nrows;
540: ii = a->compressedrow.i;
541: ridx = a->compressedrow.rindex;
542: PetscArrayzero(zarray,6*a->mbs);
543: } else {
544: mbs = a->mbs;
545: ii = a->i;
546: z = zarray;
547: }
549: for (i=0; i<mbs; i++) {
550: n = ii[1] - ii[0];
551: ii++;
552: sum1 = 0.0;
553: sum2 = 0.0;
554: sum3 = 0.0;
555: sum4 = 0.0;
556: sum5 = 0.0;
557: sum6 = 0.0;
559: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
560: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
561: for (j=0; j<n; j++) {
562: xb = x + 6*(*idx++);
563: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
564: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
565: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
566: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
567: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
568: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
569: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
570: v += 36;
571: }
572: if (usecprow) z = zarray + 6*ridx[i];
573: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
574: if (!usecprow) z += 6;
575: }
577: VecRestoreArrayRead(xx,&x);
578: VecRestoreArrayWrite(zz,&zarray);
579: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
580: return 0;
581: }
583: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
584: {
585: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
586: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
587: const PetscScalar *x,*xb;
588: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
589: const MatScalar *v;
590: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
591: PetscBool usecprow=a->compressedrow.use;
593: VecGetArrayRead(xx,&x);
594: VecGetArrayWrite(zz,&zarray);
596: idx = a->j;
597: v = a->a;
598: if (usecprow) {
599: mbs = a->compressedrow.nrows;
600: ii = a->compressedrow.i;
601: ridx = a->compressedrow.rindex;
602: PetscArrayzero(zarray,7*a->mbs);
603: } else {
604: mbs = a->mbs;
605: ii = a->i;
606: z = zarray;
607: }
609: for (i=0; i<mbs; i++) {
610: n = ii[1] - ii[0];
611: ii++;
612: sum1 = 0.0;
613: sum2 = 0.0;
614: sum3 = 0.0;
615: sum4 = 0.0;
616: sum5 = 0.0;
617: sum6 = 0.0;
618: sum7 = 0.0;
620: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
621: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
622: for (j=0; j<n; j++) {
623: xb = x + 7*(*idx++);
624: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
625: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
626: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
627: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
628: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
629: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
630: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
631: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
632: v += 49;
633: }
634: if (usecprow) z = zarray + 7*ridx[i];
635: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
636: if (!usecprow) z += 7;
637: }
639: VecRestoreArrayRead(xx,&x);
640: VecRestoreArrayWrite(zz,&zarray);
641: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
642: return 0;
643: }
645: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
646: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
647: {
648: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
649: PetscScalar *z = NULL,*work,*workt,*zarray;
650: const PetscScalar *x,*xb;
651: const MatScalar *v;
652: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
653: const PetscInt *idx,*ii,*ridx=NULL;
654: PetscInt k;
655: PetscBool usecprow=a->compressedrow.use;
657: __m256d a0,a1,a2,a3,a4,a5;
658: __m256d w0,w1,w2,w3;
659: __m256d z0,z1,z2;
660: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
662: VecGetArrayRead(xx,&x);
663: VecGetArrayWrite(zz,&zarray);
665: idx = a->j;
666: v = a->a;
667: if (usecprow) {
668: mbs = a->compressedrow.nrows;
669: ii = a->compressedrow.i;
670: ridx = a->compressedrow.rindex;
671: PetscArrayzero(zarray,bs*a->mbs);
672: } else {
673: mbs = a->mbs;
674: ii = a->i;
675: z = zarray;
676: }
678: if (!a->mult_work) {
679: k = PetscMax(A->rmap->n,A->cmap->n);
680: PetscMalloc1(k+1,&a->mult_work);
681: }
683: work = a->mult_work;
684: for (i=0; i<mbs; i++) {
685: n = ii[1] - ii[0]; ii++;
686: workt = work;
687: for (j=0; j<n; j++) {
688: xb = x + bs*(*idx++);
689: for (k=0; k<bs; k++) workt[k] = xb[k];
690: workt += bs;
691: }
692: if (usecprow) z = zarray + bs*ridx[i];
694: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
696: for (j=0; j<n; j++) {
697: /* first column of a */
698: w0 = _mm256_set1_pd(work[j*9 ]);
699: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
700: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
701: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
703: /* second column of a */
704: w1 = _mm256_set1_pd(work[j*9+ 1]);
705: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
706: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
707: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
709: /* third column of a */
710: w2 = _mm256_set1_pd(work[j*9 +2]);
711: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
712: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
713: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
715: /* fourth column of a */
716: w3 = _mm256_set1_pd(work[j*9+ 3]);
717: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
718: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
719: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
721: /* fifth column of a */
722: w0 = _mm256_set1_pd(work[j*9+ 4]);
723: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
724: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
725: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
727: /* sixth column of a */
728: w1 = _mm256_set1_pd(work[j*9+ 5]);
729: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
730: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
731: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
733: /* seventh column of a */
734: w2 = _mm256_set1_pd(work[j*9+ 6]);
735: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
736: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
737: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
739: /* eighth column of a */
740: w3 = _mm256_set1_pd(work[j*9+ 7]);
741: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
742: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
743: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
745: /* ninth column of a */
746: w0 = _mm256_set1_pd(work[j*9+ 8]);
747: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
748: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
749: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
750: }
752: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
754: v += n*bs2;
755: if (!usecprow) z += bs;
756: }
757: VecRestoreArrayRead(xx,&x);
758: VecRestoreArrayWrite(zz,&zarray);
759: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
760: return 0;
761: }
762: #endif
764: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
765: {
766: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
767: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
768: const PetscScalar *x,*xb;
769: PetscScalar *zarray,xv;
770: const MatScalar *v;
771: const PetscInt *ii,*ij=a->j,*idx;
772: PetscInt mbs,i,j,k,n,*ridx=NULL;
773: PetscBool usecprow=a->compressedrow.use;
775: VecGetArrayRead(xx,&x);
776: VecGetArrayWrite(zz,&zarray);
778: v = a->a;
779: if (usecprow) {
780: mbs = a->compressedrow.nrows;
781: ii = a->compressedrow.i;
782: ridx = a->compressedrow.rindex;
783: PetscArrayzero(zarray,11*a->mbs);
784: } else {
785: mbs = a->mbs;
786: ii = a->i;
787: z = zarray;
788: }
790: for (i=0; i<mbs; i++) {
791: n = ii[i+1] - ii[i];
792: idx = ij + ii[i];
793: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
794: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
796: for (j=0; j<n; j++) {
797: xb = x + 11*(idx[j]);
799: for (k=0; k<11; k++) {
800: xv = xb[k];
801: sum1 += v[0]*xv;
802: sum2 += v[1]*xv;
803: sum3 += v[2]*xv;
804: sum4 += v[3]*xv;
805: sum5 += v[4]*xv;
806: sum6 += v[5]*xv;
807: sum7 += v[6]*xv;
808: sum8 += v[7]*xv;
809: sum9 += v[8]*xv;
810: sum10 += v[9]*xv;
811: sum11 += v[10]*xv;
812: v += 11;
813: }
814: }
815: if (usecprow) z = zarray + 11*ridx[i];
816: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
817: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
819: if (!usecprow) z += 11;
820: }
822: VecRestoreArrayRead(xx,&x);
823: VecRestoreArrayWrite(zz,&zarray);
824: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
825: return 0;
826: }
828: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
829: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
830: {
831: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
832: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
833: const PetscScalar *x,*xb;
834: PetscScalar *zarray,xv;
835: const MatScalar *v;
836: const PetscInt *ii,*ij=a->j,*idx;
837: PetscInt mbs,i,j,k,n,*ridx=NULL;
838: PetscBool usecprow=a->compressedrow.use;
840: VecGetArrayRead(xx,&x);
841: VecGetArrayWrite(zz,&zarray);
843: v = a->a;
844: if (usecprow) {
845: mbs = a->compressedrow.nrows;
846: ii = a->compressedrow.i;
847: ridx = a->compressedrow.rindex;
848: PetscArrayzero(zarray,12*a->mbs);
849: } else {
850: mbs = a->mbs;
851: ii = a->i;
852: z = zarray;
853: }
855: for (i=0; i<mbs; i++) {
856: n = ii[i+1] - ii[i];
857: idx = ij + ii[i];
858: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
859: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
861: for (j=0; j<n; j++) {
862: xb = x + 12*(idx[j]);
864: for (k=0; k<12; k++) {
865: xv = xb[k];
866: sum1 += v[0]*xv;
867: sum2 += v[1]*xv;
868: sum3 += v[2]*xv;
869: sum4 += v[3]*xv;
870: sum5 += v[4]*xv;
871: sum6 += v[5]*xv;
872: sum7 += v[6]*xv;
873: sum8 += v[7]*xv;
874: sum9 += v[8]*xv;
875: sum10 += v[9]*xv;
876: sum11 += v[10]*xv;
877: sum12 += v[11]*xv;
878: v += 12;
879: }
880: }
881: if (usecprow) z = zarray + 12*ridx[i];
882: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
883: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
884: if (!usecprow) z += 12;
885: }
886: VecRestoreArrayRead(xx,&x);
887: VecRestoreArrayWrite(zz,&zarray);
888: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
889: return 0;
890: }
892: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
893: {
894: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
895: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
896: const PetscScalar *x,*xb;
897: PetscScalar *zarray,*yarray,xv;
898: const MatScalar *v;
899: const PetscInt *ii,*ij=a->j,*idx;
900: PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL;
901: PetscBool usecprow=a->compressedrow.use;
903: VecGetArrayRead(xx,&x);
904: VecGetArrayPair(yy,zz,&yarray,&zarray);
906: v = a->a;
907: if (usecprow) {
908: if (zz != yy) {
909: PetscArraycpy(zarray,yarray,12*mbs);
910: }
911: mbs = a->compressedrow.nrows;
912: ii = a->compressedrow.i;
913: ridx = a->compressedrow.rindex;
914: } else {
915: ii = a->i;
916: y = yarray;
917: z = zarray;
918: }
920: for (i=0; i<mbs; i++) {
921: n = ii[i+1] - ii[i];
922: idx = ij + ii[i];
924: if (usecprow) {
925: y = yarray + 12*ridx[i];
926: z = zarray + 12*ridx[i];
927: }
928: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
929: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
931: for (j=0; j<n; j++) {
932: xb = x + 12*(idx[j]);
934: for (k=0; k<12; k++) {
935: xv = xb[k];
936: sum1 += v[0]*xv;
937: sum2 += v[1]*xv;
938: sum3 += v[2]*xv;
939: sum4 += v[3]*xv;
940: sum5 += v[4]*xv;
941: sum6 += v[5]*xv;
942: sum7 += v[6]*xv;
943: sum8 += v[7]*xv;
944: sum9 += v[8]*xv;
945: sum10 += v[9]*xv;
946: sum11 += v[10]*xv;
947: sum12 += v[11]*xv;
948: v += 12;
949: }
950: }
952: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
953: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
954: if (!usecprow) {
955: y += 12;
956: z += 12;
957: }
958: }
959: VecRestoreArrayRead(xx,&x);
960: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
961: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
962: return 0;
963: }
965: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
966: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
967: {
968: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
969: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
970: const PetscScalar *x,*xb;
971: PetscScalar x1,x2,x3,x4,*zarray;
972: const MatScalar *v;
973: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
974: PetscInt mbs,i,j,n;
975: PetscBool usecprow=a->compressedrow.use;
977: VecGetArrayRead(xx,&x);
978: VecGetArrayWrite(zz,&zarray);
980: v = a->a;
981: if (usecprow) {
982: mbs = a->compressedrow.nrows;
983: ii = a->compressedrow.i;
984: ridx = a->compressedrow.rindex;
985: PetscArrayzero(zarray,12*a->mbs);
986: } else {
987: mbs = a->mbs;
988: ii = a->i;
989: z = zarray;
990: }
992: for (i=0; i<mbs; i++) {
993: n = ii[i+1] - ii[i];
994: idx = ij + ii[i];
996: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
997: for (j=0; j<n; j++) {
998: xb = x + 12*(idx[j]);
999: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1001: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1002: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1003: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1004: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1005: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1006: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1007: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1008: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1009: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1010: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1011: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1012: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1013: v += 48;
1015: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1017: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1018: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1019: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1020: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1021: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1022: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1023: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1024: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1025: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1026: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1027: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1028: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1029: v += 48;
1031: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1032: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1033: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1034: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1035: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1036: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1037: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1038: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1039: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1040: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1041: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1042: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1043: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1044: v += 48;
1046: }
1047: if (usecprow) z = zarray + 12*ridx[i];
1048: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1049: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1050: if (!usecprow) z += 12;
1051: }
1052: VecRestoreArrayRead(xx,&x);
1053: VecRestoreArrayWrite(zz,&zarray);
1054: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1055: return 0;
1056: }
1058: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1059: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1060: {
1061: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1062: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1063: const PetscScalar *x,*xb;
1064: PetscScalar x1,x2,x3,x4,*zarray,*yarray;
1065: const MatScalar *v;
1066: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1067: PetscInt mbs = a->mbs,i,j,n;
1068: PetscBool usecprow=a->compressedrow.use;
1070: VecGetArrayRead(xx,&x);
1071: VecGetArrayPair(yy,zz,&yarray,&zarray);
1073: v = a->a;
1074: if (usecprow) {
1075: if (zz != yy) {
1076: PetscArraycpy(zarray,yarray,12*mbs);
1077: }
1078: mbs = a->compressedrow.nrows;
1079: ii = a->compressedrow.i;
1080: ridx = a->compressedrow.rindex;
1081: } else {
1082: ii = a->i;
1083: y = yarray;
1084: z = zarray;
1085: }
1087: for (i=0; i<mbs; i++) {
1088: n = ii[i+1] - ii[i];
1089: idx = ij + ii[i];
1091: if (usecprow) {
1092: y = yarray + 12*ridx[i];
1093: z = zarray + 12*ridx[i];
1094: }
1095: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1096: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1098: for (j=0; j<n; j++) {
1099: xb = x + 12*(idx[j]);
1100: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1102: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1103: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1104: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1105: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1106: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1107: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1108: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1109: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1110: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1111: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1112: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1113: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1114: v += 48;
1116: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1118: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1119: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1120: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1121: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1122: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1123: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1124: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1125: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1126: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1127: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1128: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1129: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1130: v += 48;
1132: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1133: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1134: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1135: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1136: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1137: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1138: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1139: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1140: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1141: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1142: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1143: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1144: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1145: v += 48;
1147: }
1148: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1149: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1150: if (!usecprow) {
1151: y += 12;
1152: z += 12;
1153: }
1154: }
1155: VecRestoreArrayRead(xx,&x);
1156: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1157: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1158: return 0;
1159: }
1161: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1162: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1163: {
1164: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1165: PetscScalar *z = NULL,*zarray;
1166: const PetscScalar *x,*work;
1167: const MatScalar *v = a->a;
1168: PetscInt mbs,i,j,n;
1169: const PetscInt *idx = a->j,*ii,*ridx=NULL;
1170: PetscBool usecprow=a->compressedrow.use;
1171: const PetscInt bs = 12, bs2 = 144;
1173: __m256d a0,a1,a2,a3,a4,a5;
1174: __m256d w0,w1,w2,w3;
1175: __m256d z0,z1,z2;
1177: VecGetArrayRead(xx,&x);
1178: VecGetArrayWrite(zz,&zarray);
1180: if (usecprow) {
1181: mbs = a->compressedrow.nrows;
1182: ii = a->compressedrow.i;
1183: ridx = a->compressedrow.rindex;
1184: PetscArrayzero(zarray,bs*a->mbs);
1185: } else {
1186: mbs = a->mbs;
1187: ii = a->i;
1188: z = zarray;
1189: }
1191: for (i=0; i<mbs; i++) {
1192: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1194: n = ii[1] - ii[0]; ii++;
1195: for (j=0; j<n; j++) {
1196: work = x + bs*(*idx++);
1198: /* first column of a */
1199: w0 = _mm256_set1_pd(work[0]);
1200: a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1201: a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1202: a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1204: /* second column of a */
1205: w1 = _mm256_set1_pd(work[1]);
1206: a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1207: a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1208: a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1210: /* third column of a */
1211: w2 = _mm256_set1_pd(work[2]);
1212: a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1213: a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1214: a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1216: /* fourth column of a */
1217: w3 = _mm256_set1_pd(work[3]);
1218: a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1219: a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1220: a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1222: /* fifth column of a */
1223: w0 = _mm256_set1_pd(work[4]);
1224: a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1225: a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1226: a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1228: /* sixth column of a */
1229: w1 = _mm256_set1_pd(work[5]);
1230: a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1231: a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1232: a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1234: /* seventh column of a */
1235: w2 = _mm256_set1_pd(work[6]);
1236: a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1237: a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1238: a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1240: /* eighth column of a */
1241: w3 = _mm256_set1_pd(work[7]);
1242: a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1243: a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1244: a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1246: /* ninth column of a */
1247: w0 = _mm256_set1_pd(work[8]);
1248: a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1249: a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1250: a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1252: /* tenth column of a */
1253: w1 = _mm256_set1_pd(work[9]);
1254: a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1255: a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1256: a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1258: /* eleventh column of a */
1259: w2 = _mm256_set1_pd(work[10]);
1260: a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1261: a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1262: a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1264: /* twelveth column of a */
1265: w3 = _mm256_set1_pd(work[11]);
1266: a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1267: a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1268: a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1270: v += bs2;
1271: }
1272: if (usecprow) z = zarray + bs*ridx[i];
1273: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1274: if (!usecprow) z += bs;
1275: }
1276: VecRestoreArrayRead(xx,&x);
1277: VecRestoreArrayWrite(zz,&zarray);
1278: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1279: return 0;
1280: }
1281: #endif
1283: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1284: /* Default MatMult for block size 15 */
1285: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1286: {
1287: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1288: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1289: const PetscScalar *x,*xb;
1290: PetscScalar *zarray,xv;
1291: const MatScalar *v;
1292: const PetscInt *ii,*ij=a->j,*idx;
1293: PetscInt mbs,i,j,k,n,*ridx=NULL;
1294: PetscBool usecprow=a->compressedrow.use;
1296: VecGetArrayRead(xx,&x);
1297: VecGetArrayWrite(zz,&zarray);
1299: v = a->a;
1300: if (usecprow) {
1301: mbs = a->compressedrow.nrows;
1302: ii = a->compressedrow.i;
1303: ridx = a->compressedrow.rindex;
1304: PetscArrayzero(zarray,15*a->mbs);
1305: } else {
1306: mbs = a->mbs;
1307: ii = a->i;
1308: z = zarray;
1309: }
1311: for (i=0; i<mbs; i++) {
1312: n = ii[i+1] - ii[i];
1313: idx = ij + ii[i];
1314: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1315: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1317: for (j=0; j<n; j++) {
1318: xb = x + 15*(idx[j]);
1320: for (k=0; k<15; k++) {
1321: xv = xb[k];
1322: sum1 += v[0]*xv;
1323: sum2 += v[1]*xv;
1324: sum3 += v[2]*xv;
1325: sum4 += v[3]*xv;
1326: sum5 += v[4]*xv;
1327: sum6 += v[5]*xv;
1328: sum7 += v[6]*xv;
1329: sum8 += v[7]*xv;
1330: sum9 += v[8]*xv;
1331: sum10 += v[9]*xv;
1332: sum11 += v[10]*xv;
1333: sum12 += v[11]*xv;
1334: sum13 += v[12]*xv;
1335: sum14 += v[13]*xv;
1336: sum15 += v[14]*xv;
1337: v += 15;
1338: }
1339: }
1340: if (usecprow) z = zarray + 15*ridx[i];
1341: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1342: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1344: if (!usecprow) z += 15;
1345: }
1347: VecRestoreArrayRead(xx,&x);
1348: VecRestoreArrayWrite(zz,&zarray);
1349: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1350: return 0;
1351: }
1353: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1354: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1355: {
1356: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1357: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1358: const PetscScalar *x,*xb;
1359: PetscScalar x1,x2,x3,x4,*zarray;
1360: const MatScalar *v;
1361: const PetscInt *ii,*ij=a->j,*idx;
1362: PetscInt mbs,i,j,n,*ridx=NULL;
1363: PetscBool usecprow=a->compressedrow.use;
1365: VecGetArrayRead(xx,&x);
1366: VecGetArrayWrite(zz,&zarray);
1368: v = a->a;
1369: if (usecprow) {
1370: mbs = a->compressedrow.nrows;
1371: ii = a->compressedrow.i;
1372: ridx = a->compressedrow.rindex;
1373: PetscArrayzero(zarray,15*a->mbs);
1374: } else {
1375: mbs = a->mbs;
1376: ii = a->i;
1377: z = zarray;
1378: }
1380: for (i=0; i<mbs; i++) {
1381: n = ii[i+1] - ii[i];
1382: idx = ij + ii[i];
1383: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1384: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1386: for (j=0; j<n; j++) {
1387: xb = x + 15*(idx[j]);
1388: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1390: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1391: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1392: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1393: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1394: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1395: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1396: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1397: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1398: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1399: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1400: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1401: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1402: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1403: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1404: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1406: v += 60;
1408: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1410: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1411: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1412: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1413: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1414: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1415: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1416: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1417: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1418: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1419: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1420: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1421: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1422: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1423: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1424: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1425: v += 60;
1427: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1428: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1429: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1430: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1431: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1432: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1433: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1434: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1435: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1436: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1437: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1438: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1439: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1440: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1441: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1442: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1443: v += 60;
1445: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1446: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1447: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1448: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1449: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1450: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1451: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1452: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1453: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1454: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1455: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1456: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1457: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1458: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1459: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1460: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1461: v += 45;
1462: }
1463: if (usecprow) z = zarray + 15*ridx[i];
1464: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1465: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1467: if (!usecprow) z += 15;
1468: }
1470: VecRestoreArrayRead(xx,&x);
1471: VecRestoreArrayWrite(zz,&zarray);
1472: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1473: return 0;
1474: }
1476: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1477: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1478: {
1479: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1480: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1481: const PetscScalar *x,*xb;
1482: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1483: const MatScalar *v;
1484: const PetscInt *ii,*ij=a->j,*idx;
1485: PetscInt mbs,i,j,n,*ridx=NULL;
1486: PetscBool usecprow=a->compressedrow.use;
1488: VecGetArrayRead(xx,&x);
1489: VecGetArrayWrite(zz,&zarray);
1491: v = a->a;
1492: if (usecprow) {
1493: mbs = a->compressedrow.nrows;
1494: ii = a->compressedrow.i;
1495: ridx = a->compressedrow.rindex;
1496: PetscArrayzero(zarray,15*a->mbs);
1497: } else {
1498: mbs = a->mbs;
1499: ii = a->i;
1500: z = zarray;
1501: }
1503: for (i=0; i<mbs; i++) {
1504: n = ii[i+1] - ii[i];
1505: idx = ij + ii[i];
1506: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1507: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1509: for (j=0; j<n; j++) {
1510: xb = x + 15*(idx[j]);
1511: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1512: x8 = xb[7];
1514: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1515: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1516: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1517: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1518: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1519: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1520: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1521: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1522: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1523: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1524: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1525: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1526: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1527: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1528: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1529: v += 120;
1531: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1533: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1534: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1535: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1536: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1537: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1538: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1539: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1540: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1541: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1542: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1543: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1544: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1545: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1546: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1547: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1548: v += 105;
1549: }
1550: if (usecprow) z = zarray + 15*ridx[i];
1551: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1552: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1554: if (!usecprow) z += 15;
1555: }
1557: VecRestoreArrayRead(xx,&x);
1558: VecRestoreArrayWrite(zz,&zarray);
1559: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1560: return 0;
1561: }
1563: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1564: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1565: {
1566: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1567: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1568: const PetscScalar *x,*xb;
1569: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1570: const MatScalar *v;
1571: const PetscInt *ii,*ij=a->j,*idx;
1572: PetscInt mbs,i,j,n,*ridx=NULL;
1573: PetscBool usecprow=a->compressedrow.use;
1575: VecGetArrayRead(xx,&x);
1576: VecGetArrayWrite(zz,&zarray);
1578: v = a->a;
1579: if (usecprow) {
1580: mbs = a->compressedrow.nrows;
1581: ii = a->compressedrow.i;
1582: ridx = a->compressedrow.rindex;
1583: PetscArrayzero(zarray,15*a->mbs);
1584: } else {
1585: mbs = a->mbs;
1586: ii = a->i;
1587: z = zarray;
1588: }
1590: for (i=0; i<mbs; i++) {
1591: n = ii[i+1] - ii[i];
1592: idx = ij + ii[i];
1593: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1594: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1596: for (j=0; j<n; j++) {
1597: xb = x + 15*(idx[j]);
1598: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1599: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1601: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1602: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1603: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1604: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1605: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1606: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1607: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1608: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1609: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1610: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1611: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1612: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1613: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1614: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1615: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1616: v += 225;
1617: }
1618: if (usecprow) z = zarray + 15*ridx[i];
1619: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1620: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1622: if (!usecprow) z += 15;
1623: }
1625: VecRestoreArrayRead(xx,&x);
1626: VecRestoreArrayWrite(zz,&zarray);
1627: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1628: return 0;
1629: }
1631: /*
1632: This will not work with MatScalar == float because it calls the BLAS
1633: */
1634: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1635: {
1636: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1637: PetscScalar *z = NULL,*work,*workt,*zarray;
1638: const PetscScalar *x,*xb;
1639: const MatScalar *v;
1640: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1641: const PetscInt *idx,*ii,*ridx=NULL;
1642: PetscInt ncols,k;
1643: PetscBool usecprow=a->compressedrow.use;
1645: VecGetArrayRead(xx,&x);
1646: VecGetArrayWrite(zz,&zarray);
1648: idx = a->j;
1649: v = a->a;
1650: if (usecprow) {
1651: mbs = a->compressedrow.nrows;
1652: ii = a->compressedrow.i;
1653: ridx = a->compressedrow.rindex;
1654: PetscArrayzero(zarray,bs*a->mbs);
1655: } else {
1656: mbs = a->mbs;
1657: ii = a->i;
1658: z = zarray;
1659: }
1661: if (!a->mult_work) {
1662: k = PetscMax(A->rmap->n,A->cmap->n);
1663: PetscMalloc1(k+1,&a->mult_work);
1664: }
1665: work = a->mult_work;
1666: for (i=0; i<mbs; i++) {
1667: n = ii[1] - ii[0]; ii++;
1668: ncols = n*bs;
1669: workt = work;
1670: for (j=0; j<n; j++) {
1671: xb = x + bs*(*idx++);
1672: for (k=0; k<bs; k++) workt[k] = xb[k];
1673: workt += bs;
1674: }
1675: if (usecprow) z = zarray + bs*ridx[i];
1676: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1677: v += n*bs2;
1678: if (!usecprow) z += bs;
1679: }
1680: VecRestoreArrayRead(xx,&x);
1681: VecRestoreArrayWrite(zz,&zarray);
1682: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1683: return 0;
1684: }
1686: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1687: {
1688: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1689: const PetscScalar *x;
1690: PetscScalar *y,*z,sum;
1691: const MatScalar *v;
1692: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1693: const PetscInt *idx,*ii;
1694: PetscBool usecprow=a->compressedrow.use;
1696: VecGetArrayRead(xx,&x);
1697: VecGetArrayPair(yy,zz,&y,&z);
1699: idx = a->j;
1700: v = a->a;
1701: if (usecprow) {
1702: if (zz != yy) {
1703: PetscArraycpy(z,y,mbs);
1704: }
1705: mbs = a->compressedrow.nrows;
1706: ii = a->compressedrow.i;
1707: ridx = a->compressedrow.rindex;
1708: } else {
1709: ii = a->i;
1710: }
1712: for (i=0; i<mbs; i++) {
1713: n = ii[1] - ii[0];
1714: ii++;
1715: if (!usecprow) {
1716: sum = y[i];
1717: } else {
1718: sum = y[ridx[i]];
1719: }
1720: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1721: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1722: PetscSparseDensePlusDot(sum,x,v,idx,n);
1723: v += n;
1724: idx += n;
1725: if (usecprow) {
1726: z[ridx[i]] = sum;
1727: } else {
1728: z[i] = sum;
1729: }
1730: }
1731: VecRestoreArrayRead(xx,&x);
1732: VecRestoreArrayPair(yy,zz,&y,&z);
1733: PetscLogFlops(2.0*a->nz);
1734: return 0;
1735: }
1737: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1738: {
1739: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1740: PetscScalar *y = NULL,*z = NULL,sum1,sum2;
1741: const PetscScalar *x,*xb;
1742: PetscScalar x1,x2,*yarray,*zarray;
1743: const MatScalar *v;
1744: PetscInt mbs = a->mbs,i,n,j;
1745: const PetscInt *idx,*ii,*ridx = NULL;
1746: PetscBool usecprow = a->compressedrow.use;
1748: VecGetArrayRead(xx,&x);
1749: VecGetArrayPair(yy,zz,&yarray,&zarray);
1751: idx = a->j;
1752: v = a->a;
1753: if (usecprow) {
1754: if (zz != yy) {
1755: PetscArraycpy(zarray,yarray,2*mbs);
1756: }
1757: mbs = a->compressedrow.nrows;
1758: ii = a->compressedrow.i;
1759: ridx = a->compressedrow.rindex;
1760: } else {
1761: ii = a->i;
1762: y = yarray;
1763: z = zarray;
1764: }
1766: for (i=0; i<mbs; i++) {
1767: n = ii[1] - ii[0]; ii++;
1768: if (usecprow) {
1769: z = zarray + 2*ridx[i];
1770: y = yarray + 2*ridx[i];
1771: }
1772: sum1 = y[0]; sum2 = y[1];
1773: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1774: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1775: for (j=0; j<n; j++) {
1776: xb = x + 2*(*idx++);
1777: x1 = xb[0];
1778: x2 = xb[1];
1780: sum1 += v[0]*x1 + v[2]*x2;
1781: sum2 += v[1]*x1 + v[3]*x2;
1782: v += 4;
1783: }
1784: z[0] = sum1; z[1] = sum2;
1785: if (!usecprow) {
1786: z += 2; y += 2;
1787: }
1788: }
1789: VecRestoreArrayRead(xx,&x);
1790: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1791: PetscLogFlops(4.0*a->nz);
1792: return 0;
1793: }
1795: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1796: {
1797: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1798: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1799: const PetscScalar *x,*xb;
1800: const MatScalar *v;
1801: PetscInt mbs = a->mbs,i,j,n;
1802: const PetscInt *idx,*ii,*ridx = NULL;
1803: PetscBool usecprow = a->compressedrow.use;
1805: VecGetArrayRead(xx,&x);
1806: VecGetArrayPair(yy,zz,&yarray,&zarray);
1808: idx = a->j;
1809: v = a->a;
1810: if (usecprow) {
1811: if (zz != yy) {
1812: PetscArraycpy(zarray,yarray,3*mbs);
1813: }
1814: mbs = a->compressedrow.nrows;
1815: ii = a->compressedrow.i;
1816: ridx = a->compressedrow.rindex;
1817: } else {
1818: ii = a->i;
1819: y = yarray;
1820: z = zarray;
1821: }
1823: for (i=0; i<mbs; i++) {
1824: n = ii[1] - ii[0]; ii++;
1825: if (usecprow) {
1826: z = zarray + 3*ridx[i];
1827: y = yarray + 3*ridx[i];
1828: }
1829: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1830: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1831: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1832: for (j=0; j<n; j++) {
1833: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1834: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1835: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1836: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1837: v += 9;
1838: }
1839: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1840: if (!usecprow) {
1841: z += 3; y += 3;
1842: }
1843: }
1844: VecRestoreArrayRead(xx,&x);
1845: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1846: PetscLogFlops(18.0*a->nz);
1847: return 0;
1848: }
1850: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1851: {
1852: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1853: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1854: const PetscScalar *x,*xb;
1855: const MatScalar *v;
1856: PetscInt mbs = a->mbs,i,j,n;
1857: const PetscInt *idx,*ii,*ridx=NULL;
1858: PetscBool usecprow=a->compressedrow.use;
1860: VecGetArrayRead(xx,&x);
1861: VecGetArrayPair(yy,zz,&yarray,&zarray);
1863: idx = a->j;
1864: v = a->a;
1865: if (usecprow) {
1866: if (zz != yy) {
1867: PetscArraycpy(zarray,yarray,4*mbs);
1868: }
1869: mbs = a->compressedrow.nrows;
1870: ii = a->compressedrow.i;
1871: ridx = a->compressedrow.rindex;
1872: } else {
1873: ii = a->i;
1874: y = yarray;
1875: z = zarray;
1876: }
1878: for (i=0; i<mbs; i++) {
1879: n = ii[1] - ii[0]; ii++;
1880: if (usecprow) {
1881: z = zarray + 4*ridx[i];
1882: y = yarray + 4*ridx[i];
1883: }
1884: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1885: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1886: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1887: for (j=0; j<n; j++) {
1888: xb = x + 4*(*idx++);
1889: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1890: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1891: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1892: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1893: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1894: v += 16;
1895: }
1896: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1897: if (!usecprow) {
1898: z += 4; y += 4;
1899: }
1900: }
1901: VecRestoreArrayRead(xx,&x);
1902: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1903: PetscLogFlops(32.0*a->nz);
1904: return 0;
1905: }
1907: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1908: {
1909: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1910: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1911: const PetscScalar *x,*xb;
1912: PetscScalar *yarray,*zarray;
1913: const MatScalar *v;
1914: PetscInt mbs = a->mbs,i,j,n;
1915: const PetscInt *idx,*ii,*ridx = NULL;
1916: PetscBool usecprow=a->compressedrow.use;
1918: VecGetArrayRead(xx,&x);
1919: VecGetArrayPair(yy,zz,&yarray,&zarray);
1921: idx = a->j;
1922: v = a->a;
1923: if (usecprow) {
1924: if (zz != yy) {
1925: PetscArraycpy(zarray,yarray,5*mbs);
1926: }
1927: mbs = a->compressedrow.nrows;
1928: ii = a->compressedrow.i;
1929: ridx = a->compressedrow.rindex;
1930: } else {
1931: ii = a->i;
1932: y = yarray;
1933: z = zarray;
1934: }
1936: for (i=0; i<mbs; i++) {
1937: n = ii[1] - ii[0]; ii++;
1938: if (usecprow) {
1939: z = zarray + 5*ridx[i];
1940: y = yarray + 5*ridx[i];
1941: }
1942: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1943: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1944: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1945: for (j=0; j<n; j++) {
1946: xb = x + 5*(*idx++);
1947: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1948: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1949: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1950: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1951: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1952: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1953: v += 25;
1954: }
1955: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1956: if (!usecprow) {
1957: z += 5; y += 5;
1958: }
1959: }
1960: VecRestoreArrayRead(xx,&x);
1961: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1962: PetscLogFlops(50.0*a->nz);
1963: return 0;
1964: }
1966: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1967: {
1968: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1969: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
1970: const PetscScalar *x,*xb;
1971: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1972: const MatScalar *v;
1973: PetscInt mbs = a->mbs,i,j,n;
1974: const PetscInt *idx,*ii,*ridx=NULL;
1975: PetscBool usecprow=a->compressedrow.use;
1977: VecGetArrayRead(xx,&x);
1978: VecGetArrayPair(yy,zz,&yarray,&zarray);
1980: idx = a->j;
1981: v = a->a;
1982: if (usecprow) {
1983: if (zz != yy) {
1984: PetscArraycpy(zarray,yarray,6*mbs);
1985: }
1986: mbs = a->compressedrow.nrows;
1987: ii = a->compressedrow.i;
1988: ridx = a->compressedrow.rindex;
1989: } else {
1990: ii = a->i;
1991: y = yarray;
1992: z = zarray;
1993: }
1995: for (i=0; i<mbs; i++) {
1996: n = ii[1] - ii[0]; ii++;
1997: if (usecprow) {
1998: z = zarray + 6*ridx[i];
1999: y = yarray + 6*ridx[i];
2000: }
2001: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2002: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2003: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2004: for (j=0; j<n; j++) {
2005: xb = x + 6*(*idx++);
2006: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2007: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2008: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2009: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2010: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2011: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2012: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2013: v += 36;
2014: }
2015: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2016: if (!usecprow) {
2017: z += 6; y += 6;
2018: }
2019: }
2020: VecRestoreArrayRead(xx,&x);
2021: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2022: PetscLogFlops(72.0*a->nz);
2023: return 0;
2024: }
2026: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2027: {
2028: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2029: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2030: const PetscScalar *x,*xb;
2031: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2032: const MatScalar *v;
2033: PetscInt mbs = a->mbs,i,j,n;
2034: const PetscInt *idx,*ii,*ridx = NULL;
2035: PetscBool usecprow=a->compressedrow.use;
2037: VecGetArrayRead(xx,&x);
2038: VecGetArrayPair(yy,zz,&yarray,&zarray);
2040: idx = a->j;
2041: v = a->a;
2042: if (usecprow) {
2043: if (zz != yy) {
2044: PetscArraycpy(zarray,yarray,7*mbs);
2045: }
2046: mbs = a->compressedrow.nrows;
2047: ii = a->compressedrow.i;
2048: ridx = a->compressedrow.rindex;
2049: } else {
2050: ii = a->i;
2051: y = yarray;
2052: z = zarray;
2053: }
2055: for (i=0; i<mbs; i++) {
2056: n = ii[1] - ii[0]; ii++;
2057: if (usecprow) {
2058: z = zarray + 7*ridx[i];
2059: y = yarray + 7*ridx[i];
2060: }
2061: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2062: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2063: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2064: for (j=0; j<n; j++) {
2065: xb = x + 7*(*idx++);
2066: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2067: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2068: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2069: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2070: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2071: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2072: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2073: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2074: v += 49;
2075: }
2076: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2077: if (!usecprow) {
2078: z += 7; y += 7;
2079: }
2080: }
2081: VecRestoreArrayRead(xx,&x);
2082: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2083: PetscLogFlops(98.0*a->nz);
2084: return 0;
2085: }
2087: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2088: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2089: {
2090: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2091: PetscScalar *z = NULL,*work,*workt,*zarray;
2092: const PetscScalar *x,*xb;
2093: const MatScalar *v;
2094: PetscInt mbs,i,j,n;
2095: PetscInt k;
2096: PetscBool usecprow=a->compressedrow.use;
2097: const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2099: __m256d a0,a1,a2,a3,a4,a5;
2100: __m256d w0,w1,w2,w3;
2101: __m256d z0,z1,z2;
2102: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2104: VecCopy(yy,zz);
2105: VecGetArrayRead(xx,&x);
2106: VecGetArray(zz,&zarray);
2108: idx = a->j;
2109: v = a->a;
2110: if (usecprow) {
2111: mbs = a->compressedrow.nrows;
2112: ii = a->compressedrow.i;
2113: ridx = a->compressedrow.rindex;
2114: } else {
2115: mbs = a->mbs;
2116: ii = a->i;
2117: z = zarray;
2118: }
2120: if (!a->mult_work) {
2121: k = PetscMax(A->rmap->n,A->cmap->n);
2122: PetscMalloc1(k+1,&a->mult_work);
2123: }
2125: work = a->mult_work;
2126: for (i=0; i<mbs; i++) {
2127: n = ii[1] - ii[0]; ii++;
2128: workt = work;
2129: for (j=0; j<n; j++) {
2130: xb = x + bs*(*idx++);
2131: for (k=0; k<bs; k++) workt[k] = xb[k];
2132: workt += bs;
2133: }
2134: if (usecprow) z = zarray + bs*ridx[i];
2136: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2138: for (j=0; j<n; j++) {
2139: /* first column of a */
2140: w0 = _mm256_set1_pd(work[j*9 ]);
2141: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2142: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2143: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2145: /* second column of a */
2146: w1 = _mm256_set1_pd(work[j*9+ 1]);
2147: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2148: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2149: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2151: /* third column of a */
2152: w2 = _mm256_set1_pd(work[j*9+ 2]);
2153: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2154: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2155: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2157: /* fourth column of a */
2158: w3 = _mm256_set1_pd(work[j*9+ 3]);
2159: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2160: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2161: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2163: /* fifth column of a */
2164: w0 = _mm256_set1_pd(work[j*9+ 4]);
2165: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2166: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2167: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2169: /* sixth column of a */
2170: w1 = _mm256_set1_pd(work[j*9+ 5]);
2171: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2172: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2173: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2175: /* seventh column of a */
2176: w2 = _mm256_set1_pd(work[j*9+ 6]);
2177: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2178: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2179: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2181: /* eighth column of a */
2182: w3 = _mm256_set1_pd(work[j*9+ 7]);
2183: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2184: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2185: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2187: /* ninth column of a */
2188: w0 = _mm256_set1_pd(work[j*9+ 8]);
2189: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2190: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2191: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2192: }
2194: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2196: v += n*bs2;
2197: if (!usecprow) z += bs;
2198: }
2199: VecRestoreArrayRead(xx,&x);
2200: VecRestoreArray(zz,&zarray);
2201: PetscLogFlops(162.0*a->nz);
2202: return 0;
2203: }
2204: #endif
2206: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2207: {
2208: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2209: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2210: const PetscScalar *x,*xb;
2211: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2212: const MatScalar *v;
2213: PetscInt mbs = a->mbs,i,j,n;
2214: const PetscInt *idx,*ii,*ridx = NULL;
2215: PetscBool usecprow=a->compressedrow.use;
2217: VecGetArrayRead(xx,&x);
2218: VecGetArrayPair(yy,zz,&yarray,&zarray);
2220: idx = a->j;
2221: v = a->a;
2222: if (usecprow) {
2223: if (zz != yy) {
2224: PetscArraycpy(zarray,yarray,7*mbs);
2225: }
2226: mbs = a->compressedrow.nrows;
2227: ii = a->compressedrow.i;
2228: ridx = a->compressedrow.rindex;
2229: } else {
2230: ii = a->i;
2231: y = yarray;
2232: z = zarray;
2233: }
2235: for (i=0; i<mbs; i++) {
2236: n = ii[1] - ii[0]; ii++;
2237: if (usecprow) {
2238: z = zarray + 11*ridx[i];
2239: y = yarray + 11*ridx[i];
2240: }
2241: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2242: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2243: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2244: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2245: for (j=0; j<n; j++) {
2246: xb = x + 11*(*idx++);
2247: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2248: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
2249: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
2250: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
2251: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
2252: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
2253: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
2254: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
2255: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
2256: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
2257: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
2258: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
2259: v += 121;
2260: }
2261: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2262: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2263: if (!usecprow) {
2264: z += 11; y += 11;
2265: }
2266: }
2267: VecRestoreArrayRead(xx,&x);
2268: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2269: PetscLogFlops(242.0*a->nz);
2270: return 0;
2271: }
2273: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2274: {
2275: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2276: PetscScalar *z = NULL,*work,*workt,*zarray;
2277: const PetscScalar *x,*xb;
2278: const MatScalar *v;
2279: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2280: PetscInt ncols,k;
2281: const PetscInt *ridx = NULL,*idx,*ii;
2282: PetscBool usecprow = a->compressedrow.use;
2284: VecCopy(yy,zz);
2285: VecGetArrayRead(xx,&x);
2286: VecGetArray(zz,&zarray);
2288: idx = a->j;
2289: v = a->a;
2290: if (usecprow) {
2291: mbs = a->compressedrow.nrows;
2292: ii = a->compressedrow.i;
2293: ridx = a->compressedrow.rindex;
2294: } else {
2295: mbs = a->mbs;
2296: ii = a->i;
2297: z = zarray;
2298: }
2300: if (!a->mult_work) {
2301: k = PetscMax(A->rmap->n,A->cmap->n);
2302: PetscMalloc1(k+1,&a->mult_work);
2303: }
2304: work = a->mult_work;
2305: for (i=0; i<mbs; i++) {
2306: n = ii[1] - ii[0]; ii++;
2307: ncols = n*bs;
2308: workt = work;
2309: for (j=0; j<n; j++) {
2310: xb = x + bs*(*idx++);
2311: for (k=0; k<bs; k++) workt[k] = xb[k];
2312: workt += bs;
2313: }
2314: if (usecprow) z = zarray + bs*ridx[i];
2315: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2316: v += n*bs2;
2317: if (!usecprow) z += bs;
2318: }
2319: VecRestoreArrayRead(xx,&x);
2320: VecRestoreArray(zz,&zarray);
2321: PetscLogFlops(2.0*a->nz*bs2);
2322: return 0;
2323: }
2325: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2326: {
2327: PetscScalar zero = 0.0;
2329: VecSet(zz,zero);
2330: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2331: return 0;
2332: }
2334: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2335: {
2336: PetscScalar zero = 0.0;
2338: VecSet(zz,zero);
2339: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2340: return 0;
2341: }
2343: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2344: {
2345: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2346: PetscScalar *z,x1,x2,x3,x4,x5;
2347: const PetscScalar *x,*xb = NULL;
2348: const MatScalar *v;
2349: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
2350: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2351: Mat_CompressedRow cprow = a->compressedrow;
2352: PetscBool usecprow = cprow.use;
2354: if (yy != zz) VecCopy(yy,zz);
2355: VecGetArrayRead(xx,&x);
2356: VecGetArray(zz,&z);
2358: idx = a->j;
2359: v = a->a;
2360: if (usecprow) {
2361: mbs = cprow.nrows;
2362: ii = cprow.i;
2363: ridx = cprow.rindex;
2364: } else {
2365: mbs=a->mbs;
2366: ii = a->i;
2367: xb = x;
2368: }
2370: switch (bs) {
2371: case 1:
2372: for (i=0; i<mbs; i++) {
2373: if (usecprow) xb = x + ridx[i];
2374: x1 = xb[0];
2375: ib = idx + ii[0];
2376: n = ii[1] - ii[0]; ii++;
2377: for (j=0; j<n; j++) {
2378: rval = ib[j];
2379: z[rval] += PetscConj(*v) * x1;
2380: v++;
2381: }
2382: if (!usecprow) xb++;
2383: }
2384: break;
2385: case 2:
2386: for (i=0; i<mbs; i++) {
2387: if (usecprow) xb = x + 2*ridx[i];
2388: x1 = xb[0]; x2 = xb[1];
2389: ib = idx + ii[0];
2390: n = ii[1] - ii[0]; ii++;
2391: for (j=0; j<n; j++) {
2392: rval = ib[j]*2;
2393: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2394: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2395: v += 4;
2396: }
2397: if (!usecprow) xb += 2;
2398: }
2399: break;
2400: case 3:
2401: for (i=0; i<mbs; i++) {
2402: if (usecprow) xb = x + 3*ridx[i];
2403: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2404: ib = idx + ii[0];
2405: n = ii[1] - ii[0]; ii++;
2406: for (j=0; j<n; j++) {
2407: rval = ib[j]*3;
2408: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2409: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2410: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2411: v += 9;
2412: }
2413: if (!usecprow) xb += 3;
2414: }
2415: break;
2416: case 4:
2417: for (i=0; i<mbs; i++) {
2418: if (usecprow) xb = x + 4*ridx[i];
2419: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2420: ib = idx + ii[0];
2421: n = ii[1] - ii[0]; ii++;
2422: for (j=0; j<n; j++) {
2423: rval = ib[j]*4;
2424: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2425: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2426: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2427: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2428: v += 16;
2429: }
2430: if (!usecprow) xb += 4;
2431: }
2432: break;
2433: case 5:
2434: for (i=0; i<mbs; i++) {
2435: if (usecprow) xb = x + 5*ridx[i];
2436: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2437: x4 = xb[3]; x5 = xb[4];
2438: ib = idx + ii[0];
2439: n = ii[1] - ii[0]; ii++;
2440: for (j=0; j<n; j++) {
2441: rval = ib[j]*5;
2442: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2443: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2444: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2445: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2446: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2447: v += 25;
2448: }
2449: if (!usecprow) xb += 5;
2450: }
2451: break;
2452: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2453: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2454: #if 0
2455: {
2456: PetscInt ncols,k,bs2=a->bs2;
2457: PetscScalar *work,*workt,zb;
2458: const PetscScalar *xtmp;
2459: if (!a->mult_work) {
2460: k = PetscMax(A->rmap->n,A->cmap->n);
2461: PetscMalloc1(k+1,&a->mult_work);
2462: }
2463: work = a->mult_work;
2464: xtmp = x;
2465: for (i=0; i<mbs; i++) {
2466: n = ii[1] - ii[0]; ii++;
2467: ncols = n*bs;
2468: PetscArrayzero(work,ncols);
2469: if (usecprow) xtmp = x + bs*ridx[i];
2470: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2471: v += n*bs2;
2472: if (!usecprow) xtmp += bs;
2473: workt = work;
2474: for (j=0; j<n; j++) {
2475: zb = z + bs*(*idx++);
2476: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2477: workt += bs;
2478: }
2479: }
2480: }
2481: #endif
2482: }
2483: VecRestoreArrayRead(xx,&x);
2484: VecRestoreArray(zz,&z);
2485: PetscLogFlops(2.0*a->nz*a->bs2);
2486: return 0;
2487: }
2489: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2490: {
2491: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2492: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2493: const PetscScalar *x,*xb = NULL;
2494: const MatScalar *v;
2495: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2496: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2497: Mat_CompressedRow cprow = a->compressedrow;
2498: PetscBool usecprow=cprow.use;
2500: if (yy != zz) VecCopy(yy,zz);
2501: VecGetArrayRead(xx,&x);
2502: VecGetArray(zz,&z);
2504: idx = a->j;
2505: v = a->a;
2506: if (usecprow) {
2507: mbs = cprow.nrows;
2508: ii = cprow.i;
2509: ridx = cprow.rindex;
2510: } else {
2511: mbs=a->mbs;
2512: ii = a->i;
2513: xb = x;
2514: }
2516: switch (bs) {
2517: case 1:
2518: for (i=0; i<mbs; i++) {
2519: if (usecprow) xb = x + ridx[i];
2520: x1 = xb[0];
2521: ib = idx + ii[0];
2522: n = ii[1] - ii[0]; ii++;
2523: for (j=0; j<n; j++) {
2524: rval = ib[j];
2525: z[rval] += *v * x1;
2526: v++;
2527: }
2528: if (!usecprow) xb++;
2529: }
2530: break;
2531: case 2:
2532: for (i=0; i<mbs; i++) {
2533: if (usecprow) xb = x + 2*ridx[i];
2534: x1 = xb[0]; x2 = xb[1];
2535: ib = idx + ii[0];
2536: n = ii[1] - ii[0]; ii++;
2537: for (j=0; j<n; j++) {
2538: rval = ib[j]*2;
2539: z[rval++] += v[0]*x1 + v[1]*x2;
2540: z[rval++] += v[2]*x1 + v[3]*x2;
2541: v += 4;
2542: }
2543: if (!usecprow) xb += 2;
2544: }
2545: break;
2546: case 3:
2547: for (i=0; i<mbs; i++) {
2548: if (usecprow) xb = x + 3*ridx[i];
2549: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2550: ib = idx + ii[0];
2551: n = ii[1] - ii[0]; ii++;
2552: for (j=0; j<n; j++) {
2553: rval = ib[j]*3;
2554: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2555: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2556: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2557: v += 9;
2558: }
2559: if (!usecprow) xb += 3;
2560: }
2561: break;
2562: case 4:
2563: for (i=0; i<mbs; i++) {
2564: if (usecprow) xb = x + 4*ridx[i];
2565: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2566: ib = idx + ii[0];
2567: n = ii[1] - ii[0]; ii++;
2568: for (j=0; j<n; j++) {
2569: rval = ib[j]*4;
2570: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2571: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2572: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2573: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2574: v += 16;
2575: }
2576: if (!usecprow) xb += 4;
2577: }
2578: break;
2579: case 5:
2580: for (i=0; i<mbs; i++) {
2581: if (usecprow) xb = x + 5*ridx[i];
2582: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2583: x4 = xb[3]; x5 = xb[4];
2584: ib = idx + ii[0];
2585: n = ii[1] - ii[0]; ii++;
2586: for (j=0; j<n; j++) {
2587: rval = ib[j]*5;
2588: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2589: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2590: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2591: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2592: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2593: v += 25;
2594: }
2595: if (!usecprow) xb += 5;
2596: }
2597: break;
2598: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2599: PetscInt ncols,k;
2600: PetscScalar *work,*workt;
2601: const PetscScalar *xtmp;
2602: if (!a->mult_work) {
2603: k = PetscMax(A->rmap->n,A->cmap->n);
2604: PetscMalloc1(k+1,&a->mult_work);
2605: }
2606: work = a->mult_work;
2607: xtmp = x;
2608: for (i=0; i<mbs; i++) {
2609: n = ii[1] - ii[0]; ii++;
2610: ncols = n*bs;
2611: PetscArrayzero(work,ncols);
2612: if (usecprow) xtmp = x + bs*ridx[i];
2613: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2614: v += n*bs2;
2615: if (!usecprow) xtmp += bs;
2616: workt = work;
2617: for (j=0; j<n; j++) {
2618: zb = z + bs*(*idx++);
2619: for (k=0; k<bs; k++) zb[k] += workt[k];
2620: workt += bs;
2621: }
2622: }
2623: }
2624: }
2625: VecRestoreArrayRead(xx,&x);
2626: VecRestoreArray(zz,&z);
2627: PetscLogFlops(2.0*a->nz*a->bs2);
2628: return 0;
2629: }
2631: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2632: {
2633: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2634: PetscInt totalnz = a->bs2*a->nz;
2635: PetscScalar oalpha = alpha;
2636: PetscBLASInt one = 1,tnz;
2638: PetscBLASIntCast(totalnz,&tnz);
2639: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2640: PetscLogFlops(totalnz);
2641: return 0;
2642: }
2644: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2645: {
2646: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2647: MatScalar *v = a->a;
2648: PetscReal sum = 0.0;
2649: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2651: if (type == NORM_FROBENIUS) {
2652: #if defined(PETSC_USE_REAL___FP16)
2653: PetscBLASInt one = 1,cnt = bs2*nz;
2654: PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2655: #else
2656: for (i=0; i<bs2*nz; i++) {
2657: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2658: }
2659: #endif
2660: *norm = PetscSqrtReal(sum);
2661: PetscLogFlops(2.0*bs2*nz);
2662: } else if (type == NORM_1) { /* maximum column sum */
2663: PetscReal *tmp;
2664: PetscInt *bcol = a->j;
2665: PetscCalloc1(A->cmap->n+1,&tmp);
2666: for (i=0; i<nz; i++) {
2667: for (j=0; j<bs; j++) {
2668: k1 = bs*(*bcol) + j; /* column index */
2669: for (k=0; k<bs; k++) {
2670: tmp[k1] += PetscAbsScalar(*v); v++;
2671: }
2672: }
2673: bcol++;
2674: }
2675: *norm = 0.0;
2676: for (j=0; j<A->cmap->n; j++) {
2677: if (tmp[j] > *norm) *norm = tmp[j];
2678: }
2679: PetscFree(tmp);
2680: PetscLogFlops(PetscMax(bs2*nz-1,0));
2681: } else if (type == NORM_INFINITY) { /* maximum row sum */
2682: *norm = 0.0;
2683: for (k=0; k<bs; k++) {
2684: for (j=0; j<a->mbs; j++) {
2685: v = a->a + bs2*a->i[j] + k;
2686: sum = 0.0;
2687: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2688: for (k1=0; k1<bs; k1++) {
2689: sum += PetscAbsScalar(*v);
2690: v += bs;
2691: }
2692: }
2693: if (sum > *norm) *norm = sum;
2694: }
2695: }
2696: PetscLogFlops(PetscMax(bs2*nz-1,0));
2697: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2698: return 0;
2699: }
2701: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2702: {
2703: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2705: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2706: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2707: *flg = PETSC_FALSE;
2708: return 0;
2709: }
2711: /* if the a->i are the same */
2712: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2713: if (!*flg) return 0;
2715: /* if a->j are the same */
2716: PetscArraycmp(a->j,b->j,a->nz,flg);
2717: if (!*flg) return 0;
2719: /* if a->a are the same */
2720: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2721: return 0;
2723: }
2725: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2726: {
2727: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2728: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2729: PetscScalar *x,zero = 0.0;
2730: MatScalar *aa,*aa_j;
2733: bs = A->rmap->bs;
2734: aa = a->a;
2735: ai = a->i;
2736: aj = a->j;
2737: ambs = a->mbs;
2738: bs2 = a->bs2;
2740: VecSet(v,zero);
2741: VecGetArray(v,&x);
2742: VecGetLocalSize(v,&n);
2744: for (i=0; i<ambs; i++) {
2745: for (j=ai[i]; j<ai[i+1]; j++) {
2746: if (aj[j] == i) {
2747: row = i*bs;
2748: aa_j = aa+j*bs2;
2749: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2750: break;
2751: }
2752: }
2753: }
2754: VecRestoreArray(v,&x);
2755: return 0;
2756: }
2758: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2759: {
2760: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2761: const PetscScalar *l,*r,*li,*ri;
2762: PetscScalar x;
2763: MatScalar *aa, *v;
2764: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2765: const PetscInt *ai,*aj;
2767: ai = a->i;
2768: aj = a->j;
2769: aa = a->a;
2770: m = A->rmap->n;
2771: n = A->cmap->n;
2772: bs = A->rmap->bs;
2773: mbs = a->mbs;
2774: bs2 = a->bs2;
2775: if (ll) {
2776: VecGetArrayRead(ll,&l);
2777: VecGetLocalSize(ll,&lm);
2779: for (i=0; i<mbs; i++) { /* for each block row */
2780: M = ai[i+1] - ai[i];
2781: li = l + i*bs;
2782: v = aa + bs2*ai[i];
2783: for (j=0; j<M; j++) { /* for each block */
2784: for (k=0; k<bs2; k++) {
2785: (*v++) *= li[k%bs];
2786: }
2787: }
2788: }
2789: VecRestoreArrayRead(ll,&l);
2790: PetscLogFlops(a->nz);
2791: }
2793: if (rr) {
2794: VecGetArrayRead(rr,&r);
2795: VecGetLocalSize(rr,&rn);
2797: for (i=0; i<mbs; i++) { /* for each block row */
2798: iai = ai[i];
2799: M = ai[i+1] - iai;
2800: v = aa + bs2*iai;
2801: for (j=0; j<M; j++) { /* for each block */
2802: ri = r + bs*aj[iai+j];
2803: for (k=0; k<bs; k++) {
2804: x = ri[k];
2805: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2806: v += bs;
2807: }
2808: }
2809: }
2810: VecRestoreArrayRead(rr,&r);
2811: PetscLogFlops(a->nz);
2812: }
2813: return 0;
2814: }
2816: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2817: {
2818: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2820: info->block_size = a->bs2;
2821: info->nz_allocated = a->bs2*a->maxnz;
2822: info->nz_used = a->bs2*a->nz;
2823: info->nz_unneeded = info->nz_allocated - info->nz_used;
2824: info->assemblies = A->num_ass;
2825: info->mallocs = A->info.mallocs;
2826: info->memory = ((PetscObject)A)->mem;
2827: if (A->factortype) {
2828: info->fill_ratio_given = A->info.fill_ratio_given;
2829: info->fill_ratio_needed = A->info.fill_ratio_needed;
2830: info->factor_mallocs = A->info.factor_mallocs;
2831: } else {
2832: info->fill_ratio_given = 0;
2833: info->fill_ratio_needed = 0;
2834: info->factor_mallocs = 0;
2835: }
2836: return 0;
2837: }
2839: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2840: {
2841: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2843: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2844: return 0;
2845: }
2847: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2848: {
2849: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2850: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2851: return 0;
2852: }
2854: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2855: {
2856: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2857: PetscScalar *z = NULL,sum1;
2858: const PetscScalar *xb;
2859: PetscScalar x1;
2860: const MatScalar *v,*vv;
2861: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2862: PetscBool usecprow=a->compressedrow.use;
2864: idx = a->j;
2865: v = a->a;
2866: if (usecprow) {
2867: mbs = a->compressedrow.nrows;
2868: ii = a->compressedrow.i;
2869: ridx = a->compressedrow.rindex;
2870: } else {
2871: mbs = a->mbs;
2872: ii = a->i;
2873: z = c;
2874: }
2876: for (i=0; i<mbs; i++) {
2877: n = ii[1] - ii[0]; ii++;
2878: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2879: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2880: if (usecprow) z = c + ridx[i];
2881: jj = idx;
2882: vv = v;
2883: for (k=0; k<cn; k++) {
2884: idx = jj;
2885: v = vv;
2886: sum1 = 0.0;
2887: for (j=0; j<n; j++) {
2888: xb = b + (*idx++); x1 = xb[0+k*bm];
2889: sum1 += v[0]*x1;
2890: v += 1;
2891: }
2892: z[0+k*cm] = sum1;
2893: }
2894: if (!usecprow) z += 1;
2895: }
2896: return 0;
2897: }
2899: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2900: {
2901: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2902: PetscScalar *z = NULL,sum1,sum2;
2903: const PetscScalar *xb;
2904: PetscScalar x1,x2;
2905: const MatScalar *v,*vv;
2906: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2907: PetscBool usecprow=a->compressedrow.use;
2909: idx = a->j;
2910: v = a->a;
2911: if (usecprow) {
2912: mbs = a->compressedrow.nrows;
2913: ii = a->compressedrow.i;
2914: ridx = a->compressedrow.rindex;
2915: } else {
2916: mbs = a->mbs;
2917: ii = a->i;
2918: z = c;
2919: }
2921: for (i=0; i<mbs; i++) {
2922: n = ii[1] - ii[0]; ii++;
2923: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2924: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2925: if (usecprow) z = c + 2*ridx[i];
2926: jj = idx;
2927: vv = v;
2928: for (k=0; k<cn; k++) {
2929: idx = jj;
2930: v = vv;
2931: sum1 = 0.0; sum2 = 0.0;
2932: for (j=0; j<n; j++) {
2933: xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2934: sum1 += v[0]*x1 + v[2]*x2;
2935: sum2 += v[1]*x1 + v[3]*x2;
2936: v += 4;
2937: }
2938: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2939: }
2940: if (!usecprow) z += 2;
2941: }
2942: return 0;
2943: }
2945: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2946: {
2947: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2948: PetscScalar *z = NULL,sum1,sum2,sum3;
2949: const PetscScalar *xb;
2950: PetscScalar x1,x2,x3;
2951: const MatScalar *v,*vv;
2952: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2953: PetscBool usecprow=a->compressedrow.use;
2955: idx = a->j;
2956: v = a->a;
2957: if (usecprow) {
2958: mbs = a->compressedrow.nrows;
2959: ii = a->compressedrow.i;
2960: ridx = a->compressedrow.rindex;
2961: } else {
2962: mbs = a->mbs;
2963: ii = a->i;
2964: z = c;
2965: }
2967: for (i=0; i<mbs; i++) {
2968: n = ii[1] - ii[0]; ii++;
2969: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2970: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2971: if (usecprow) z = c + 3*ridx[i];
2972: jj = idx;
2973: vv = v;
2974: for (k=0; k<cn; k++) {
2975: idx = jj;
2976: v = vv;
2977: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
2978: for (j=0; j<n; j++) {
2979: xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2980: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2981: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2982: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2983: v += 9;
2984: }
2985: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2986: }
2987: if (!usecprow) z += 3;
2988: }
2989: return 0;
2990: }
2992: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2993: {
2994: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2995: PetscScalar *z = NULL,sum1,sum2,sum3,sum4;
2996: const PetscScalar *xb;
2997: PetscScalar x1,x2,x3,x4;
2998: const MatScalar *v,*vv;
2999: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3000: PetscBool usecprow=a->compressedrow.use;
3002: idx = a->j;
3003: v = a->a;
3004: if (usecprow) {
3005: mbs = a->compressedrow.nrows;
3006: ii = a->compressedrow.i;
3007: ridx = a->compressedrow.rindex;
3008: } else {
3009: mbs = a->mbs;
3010: ii = a->i;
3011: z = c;
3012: }
3014: for (i=0; i<mbs; i++) {
3015: n = ii[1] - ii[0]; ii++;
3016: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3017: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3018: if (usecprow) z = c + 4*ridx[i];
3019: jj = idx;
3020: vv = v;
3021: for (k=0; k<cn; k++) {
3022: idx = jj;
3023: v = vv;
3024: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3025: for (j=0; j<n; j++) {
3026: xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3027: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3028: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3029: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3030: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3031: v += 16;
3032: }
3033: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3034: }
3035: if (!usecprow) z += 4;
3036: }
3037: return 0;
3038: }
3040: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3041: {
3042: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3043: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5;
3044: const PetscScalar *xb;
3045: PetscScalar x1,x2,x3,x4,x5;
3046: const MatScalar *v,*vv;
3047: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3048: PetscBool usecprow=a->compressedrow.use;
3050: idx = a->j;
3051: v = a->a;
3052: if (usecprow) {
3053: mbs = a->compressedrow.nrows;
3054: ii = a->compressedrow.i;
3055: ridx = a->compressedrow.rindex;
3056: } else {
3057: mbs = a->mbs;
3058: ii = a->i;
3059: z = c;
3060: }
3062: for (i=0; i<mbs; i++) {
3063: n = ii[1] - ii[0]; ii++;
3064: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3065: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3066: if (usecprow) z = c + 5*ridx[i];
3067: jj = idx;
3068: vv = v;
3069: for (k=0; k<cn; k++) {
3070: idx = jj;
3071: v = vv;
3072: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3073: for (j=0; j<n; j++) {
3074: xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3075: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3076: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3077: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3078: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3079: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3080: v += 25;
3081: }
3082: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3083: }
3084: if (!usecprow) z += 5;
3085: }
3086: return 0;
3087: }
3089: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3090: {
3091: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3092: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
3093: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
3094: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3095: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3096: PetscBLASInt bbs,bcn,bbm,bcm;
3097: PetscScalar *z = NULL;
3098: PetscScalar *c,*b;
3099: const MatScalar *v;
3100: const PetscInt *idx,*ii,*ridx=NULL;
3101: PetscScalar _DZero=0.0,_DOne=1.0;
3102: PetscBool usecprow=a->compressedrow.use;
3104: if (!cm || !cn) return 0;
3108: b = bd->v;
3109: if (a->nonzerorowcnt != A->rmap->n) {
3110: MatZeroEntries(C);
3111: }
3112: MatDenseGetArray(C,&c);
3113: switch (bs) {
3114: case 1:
3115: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3116: break;
3117: case 2:
3118: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3119: break;
3120: case 3:
3121: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3122: break;
3123: case 4:
3124: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3125: break;
3126: case 5:
3127: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3128: break;
3129: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3130: PetscBLASIntCast(bs,&bbs);
3131: PetscBLASIntCast(cn,&bcn);
3132: PetscBLASIntCast(bm,&bbm);
3133: PetscBLASIntCast(cm,&bcm);
3134: idx = a->j;
3135: v = a->a;
3136: if (usecprow) {
3137: mbs = a->compressedrow.nrows;
3138: ii = a->compressedrow.i;
3139: ridx = a->compressedrow.rindex;
3140: } else {
3141: mbs = a->mbs;
3142: ii = a->i;
3143: z = c;
3144: }
3145: for (i=0; i<mbs; i++) {
3146: n = ii[1] - ii[0]; ii++;
3147: if (usecprow) z = c + bs*ridx[i];
3148: if (n) {
3149: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3150: v += bs2;
3151: }
3152: for (j=1; j<n; j++) {
3153: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3154: v += bs2;
3155: }
3156: if (!usecprow) z += bs;
3157: }
3158: }
3159: MatDenseRestoreArray(C,&c);
3160: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3161: return 0;
3162: }