Actual source code: baij2.c
petsc-3.11.4 2019-09-28
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
15: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
16: const PetscInt *idx;
17: PetscInt start,end,*ai,*aj,bs,*nidx2;
18: PetscBT table;
21: m = a->mbs;
22: ai = a->i;
23: aj = a->j;
24: bs = A->rmap->bs;
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28: PetscBTCreate(m,&table);
29: PetscMalloc1(m+1,&nidx);
30: PetscMalloc1(A->rmap->N+1,&nidx2);
32: for (i=0; i<is_max; i++) {
33: /* Initialise the two local arrays */
34: isz = 0;
35: PetscBTMemzero(m,table);
37: /* Extract the indices, assume there can be duplicate entries */
38: ISGetIndices(is[i],&idx);
39: ISGetLocalSize(is[i],&n);
41: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42: for (j=0; j<n; ++j) {
43: ival = idx[j]/bs; /* convert the indices into block indices */
44: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(&is[i]);
50: k = 0;
51: for (j=0; j<ov; j++) { /* for each overlap*/
52: n = isz;
53: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
54: row = nidx[k];
55: start = ai[row];
56: end = ai[row+1];
57: for (l = start; l<end; l++) {
58: val = aj[l];
59: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60: }
61: }
62: }
63: /* expand the Index Set */
64: for (j=0; j<isz; j++) {
65: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66: }
67: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
68: }
69: PetscBTDestroy(&table);
70: PetscFree(nidx);
71: PetscFree(nidx2);
72: return(0);
73: }
75: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
89: ISGetIndices(isrow,&irow);
90: ISGetIndices(iscol,&icol);
91: ISGetLocalSize(isrow,&nrows);
92: ISGetLocalSize(iscol,&ncols);
94: PetscCalloc1(1+oldcols,&smap);
95: ssmap = smap;
96: PetscMalloc1(1+nrows,&lens);
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) lens[i]++;
105: }
106: }
107: /* Create and fill new matrix */
108: if (scall == MAT_REUSE_MATRIX) {
109: c = (Mat_SeqBAIJ*)((*B)->data);
111: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
113: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
115: C = *B;
116: } else {
117: MatCreate(PetscObjectComm((PetscObject)A),&C);
118: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119: MatSetType(C,((PetscObject)A)->type_name);
120: MatSeqBAIJSetPreallocation(C,bs,0,lens);
121: }
122: c = (Mat_SeqBAIJ*)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140: /* sort */
141: {
142: MatScalar *work;
143: PetscMalloc1(bs2,&work);
144: for (i=0; i<nrows; i++) {
145: PetscInt ilen;
146: mat_i = c->i[i];
147: mat_j = c->j + mat_i;
148: mat_a = c->a + mat_i*bs2;
149: ilen = c->ilen[i];
150: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151: }
152: PetscFree(work);
153: }
155: /* Free work space */
156: ISRestoreIndices(iscol,&icol);
157: PetscFree(smap);
158: PetscFree(lens);
159: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
162: ISRestoreIndices(isrow,&irow);
163: *B = C;
164: return(0);
165: }
167: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168: {
169: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
170: IS is1,is2;
172: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173: const PetscInt *irow,*icol;
176: ISGetIndices(isrow,&irow);
177: ISGetIndices(iscol,&icol);
178: ISGetLocalSize(isrow,&nrows);
179: ISGetLocalSize(iscol,&ncols);
181: /* Verify if the indices corespond to each element in a block
182: and form the IS with compressed IS */
183: maxmnbs = PetscMax(a->mbs,a->nbs);
184: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
185: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
186: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187: for (i=0; i<a->mbs; i++) {
188: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189: }
190: count = 0;
191: for (i=0; i<nrows; i++) {
192: j = irow[i] / bs;
193: if ((vary[j]--)==bs) iary[count++] = j;
194: }
195: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
197: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
198: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199: for (i=0; i<a->nbs; i++) {
200: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201: }
202: count = 0;
203: for (i=0; i<ncols; i++) {
204: j = icol[i] / bs;
205: if ((vary[j]--)==bs) iary[count++] = j;
206: }
207: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
208: ISRestoreIndices(isrow,&irow);
209: ISRestoreIndices(iscol,&icol);
210: PetscFree2(vary,iary);
212: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
213: ISDestroy(&is1);
214: ISDestroy(&is2);
215: return(0);
216: }
218: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219: {
221: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
222: Mat_SubSppt *submatj = c->submatis1;
225: (*submatj->destroy)(C);
226: MatDestroySubMatrix_Private(submatj);
227: return(0);
228: }
230: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231: {
233: PetscInt i;
234: Mat C;
235: Mat_SeqBAIJ *c;
236: Mat_SubSppt *submatj;
239: for (i=0; i<n; i++) {
240: C = (*mat)[i];
241: c = (Mat_SeqBAIJ*)C->data;
242: submatj = c->submatis1;
243: if (submatj) {
244: (*submatj->destroy)(C);
245: MatDestroySubMatrix_Private(submatj);
246: PetscFree(C->defaultvectype);
247: PetscLayoutDestroy(&C->rmap);
248: PetscLayoutDestroy(&C->cmap);
249: PetscHeaderDestroy(&C);
250: } else {
251: MatDestroy(&C);
252: }
253: }
254: PetscFree(*mat);
255: return(0);
256: }
258: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
259: {
261: PetscInt i;
264: if (scall == MAT_INITIAL_MATRIX) {
265: PetscCalloc1(n+1,B);
266: }
268: for (i=0; i<n; i++) {
269: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
270: }
271: return(0);
272: }
274: /* -------------------------------------------------------*/
275: /* Should check that shapes of vectors and matrices match */
276: /* -------------------------------------------------------*/
278: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
279: {
280: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
281: PetscScalar *z,sum;
282: const PetscScalar *x;
283: const MatScalar *v;
284: PetscErrorCode ierr;
285: PetscInt mbs,i,n;
286: const PetscInt *idx,*ii,*ridx=NULL;
287: PetscBool usecprow=a->compressedrow.use;
290: VecGetArrayRead(xx,&x);
291: VecGetArray(zz,&z);
293: if (usecprow) {
294: mbs = a->compressedrow.nrows;
295: ii = a->compressedrow.i;
296: ridx = a->compressedrow.rindex;
297: PetscMemzero(z,a->mbs*sizeof(PetscScalar));
298: } else {
299: mbs = a->mbs;
300: ii = a->i;
301: }
303: for (i=0; i<mbs; i++) {
304: n = ii[1] - ii[0];
305: v = a->a + ii[0];
306: idx = a->j + ii[0];
307: ii++;
308: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
309: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310: sum = 0.0;
311: PetscSparseDensePlusDot(sum,x,v,idx,n);
312: if (usecprow) {
313: z[ridx[i]] = sum;
314: } else {
315: z[i] = sum;
316: }
317: }
318: VecRestoreArrayRead(xx,&x);
319: VecRestoreArray(zz,&z);
320: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
321: return(0);
322: }
324: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
325: {
326: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
327: PetscScalar *z = 0,sum1,sum2,*zarray;
328: const PetscScalar *x,*xb;
329: PetscScalar x1,x2;
330: const MatScalar *v;
331: PetscErrorCode ierr;
332: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
333: PetscBool usecprow=a->compressedrow.use;
336: VecGetArrayRead(xx,&x);
337: VecGetArray(zz,&zarray);
339: idx = a->j;
340: v = a->a;
341: if (usecprow) {
342: mbs = a->compressedrow.nrows;
343: ii = a->compressedrow.i;
344: ridx = a->compressedrow.rindex;
345: PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
346: } else {
347: mbs = a->mbs;
348: ii = a->i;
349: z = zarray;
350: }
352: for (i=0; i<mbs; i++) {
353: n = ii[1] - ii[0]; ii++;
354: sum1 = 0.0; sum2 = 0.0;
355: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
356: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357: for (j=0; j<n; j++) {
358: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
359: sum1 += v[0]*x1 + v[2]*x2;
360: sum2 += v[1]*x1 + v[3]*x2;
361: v += 4;
362: }
363: if (usecprow) z = zarray + 2*ridx[i];
364: z[0] = sum1; z[1] = sum2;
365: if (!usecprow) z += 2;
366: }
367: VecRestoreArrayRead(xx,&x);
368: VecRestoreArray(zz,&zarray);
369: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
370: return(0);
371: }
373: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
374: {
375: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
376: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
377: const PetscScalar *x,*xb;
378: const MatScalar *v;
379: PetscErrorCode ierr;
380: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
381: PetscBool usecprow=a->compressedrow.use;
383: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
384: #pragma disjoint(*v,*z,*xb)
385: #endif
388: VecGetArrayRead(xx,&x);
389: VecGetArray(zz,&zarray);
391: idx = a->j;
392: v = a->a;
393: if (usecprow) {
394: mbs = a->compressedrow.nrows;
395: ii = a->compressedrow.i;
396: ridx = a->compressedrow.rindex;
397: PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
398: } else {
399: mbs = a->mbs;
400: ii = a->i;
401: z = zarray;
402: }
404: for (i=0; i<mbs; i++) {
405: n = ii[1] - ii[0]; ii++;
406: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
407: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
408: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
409: for (j=0; j<n; j++) {
410: xb = x + 3*(*idx++);
411: x1 = xb[0];
412: x2 = xb[1];
413: x3 = xb[2];
415: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
416: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
417: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
418: v += 9;
419: }
420: if (usecprow) z = zarray + 3*ridx[i];
421: z[0] = sum1; z[1] = sum2; z[2] = sum3;
422: if (!usecprow) z += 3;
423: }
424: VecRestoreArrayRead(xx,&x);
425: VecRestoreArray(zz,&zarray);
426: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
427: return(0);
428: }
430: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
431: {
432: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
433: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
434: const PetscScalar *x,*xb;
435: const MatScalar *v;
436: PetscErrorCode ierr;
437: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
438: PetscBool usecprow=a->compressedrow.use;
441: VecGetArrayRead(xx,&x);
442: VecGetArray(zz,&zarray);
444: idx = a->j;
445: v = a->a;
446: if (usecprow) {
447: mbs = a->compressedrow.nrows;
448: ii = a->compressedrow.i;
449: ridx = a->compressedrow.rindex;
450: PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
451: } else {
452: mbs = a->mbs;
453: ii = a->i;
454: z = zarray;
455: }
457: for (i=0; i<mbs; i++) {
458: n = ii[1] - ii[0];
459: ii++;
460: sum1 = 0.0;
461: sum2 = 0.0;
462: sum3 = 0.0;
463: sum4 = 0.0;
465: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
466: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
467: for (j=0; j<n; j++) {
468: xb = x + 4*(*idx++);
469: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
470: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
471: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
472: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
473: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
474: v += 16;
475: }
476: if (usecprow) z = zarray + 4*ridx[i];
477: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
478: if (!usecprow) z += 4;
479: }
480: VecRestoreArrayRead(xx,&x);
481: VecRestoreArray(zz,&zarray);
482: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
483: return(0);
484: }
486: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
487: {
488: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
489: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
490: const PetscScalar *xb,*x;
491: const MatScalar *v;
492: PetscErrorCode ierr;
493: const PetscInt *idx,*ii,*ridx=NULL;
494: PetscInt mbs,i,j,n;
495: PetscBool usecprow=a->compressedrow.use;
498: VecGetArrayRead(xx,&x);
499: VecGetArray(zz,&zarray);
501: idx = a->j;
502: v = a->a;
503: if (usecprow) {
504: mbs = a->compressedrow.nrows;
505: ii = a->compressedrow.i;
506: ridx = a->compressedrow.rindex;
507: PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
508: } else {
509: mbs = a->mbs;
510: ii = a->i;
511: z = zarray;
512: }
514: for (i=0; i<mbs; i++) {
515: n = ii[1] - ii[0]; ii++;
516: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
517: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
518: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
519: for (j=0; j<n; j++) {
520: xb = x + 5*(*idx++);
521: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
522: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
523: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
524: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
525: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
526: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
527: v += 25;
528: }
529: if (usecprow) z = zarray + 5*ridx[i];
530: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
531: if (!usecprow) z += 5;
532: }
533: VecRestoreArrayRead(xx,&x);
534: VecRestoreArray(zz,&zarray);
535: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
536: return(0);
537: }
541: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
542: {
543: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
544: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
545: const PetscScalar *x,*xb;
546: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
547: const MatScalar *v;
548: PetscErrorCode ierr;
549: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
550: PetscBool usecprow=a->compressedrow.use;
553: VecGetArrayRead(xx,&x);
554: VecGetArray(zz,&zarray);
556: idx = a->j;
557: v = a->a;
558: if (usecprow) {
559: mbs = a->compressedrow.nrows;
560: ii = a->compressedrow.i;
561: ridx = a->compressedrow.rindex;
562: PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
563: } else {
564: mbs = a->mbs;
565: ii = a->i;
566: z = zarray;
567: }
569: for (i=0; i<mbs; i++) {
570: n = ii[1] - ii[0];
571: ii++;
572: sum1 = 0.0;
573: sum2 = 0.0;
574: sum3 = 0.0;
575: sum4 = 0.0;
576: sum5 = 0.0;
577: sum6 = 0.0;
579: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
580: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
581: for (j=0; j<n; j++) {
582: xb = x + 6*(*idx++);
583: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
584: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
585: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
586: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
587: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
588: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
589: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
590: v += 36;
591: }
592: if (usecprow) z = zarray + 6*ridx[i];
593: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
594: if (!usecprow) z += 6;
595: }
597: VecRestoreArrayRead(xx,&x);
598: VecRestoreArray(zz,&zarray);
599: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
600: return(0);
601: }
603: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
604: {
605: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
606: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
607: const PetscScalar *x,*xb;
608: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
609: const MatScalar *v;
610: PetscErrorCode ierr;
611: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
612: PetscBool usecprow=a->compressedrow.use;
615: VecGetArrayRead(xx,&x);
616: VecGetArray(zz,&zarray);
618: idx = a->j;
619: v = a->a;
620: if (usecprow) {
621: mbs = a->compressedrow.nrows;
622: ii = a->compressedrow.i;
623: ridx = a->compressedrow.rindex;
624: PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
625: } else {
626: mbs = a->mbs;
627: ii = a->i;
628: z = zarray;
629: }
631: for (i=0; i<mbs; i++) {
632: n = ii[1] - ii[0];
633: ii++;
634: sum1 = 0.0;
635: sum2 = 0.0;
636: sum3 = 0.0;
637: sum4 = 0.0;
638: sum5 = 0.0;
639: sum6 = 0.0;
640: sum7 = 0.0;
642: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
643: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
644: for (j=0; j<n; j++) {
645: xb = x + 7*(*idx++);
646: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
647: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
648: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
649: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
650: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
651: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
652: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
653: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
654: v += 49;
655: }
656: if (usecprow) z = zarray + 7*ridx[i];
657: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
658: if (!usecprow) z += 7;
659: }
661: VecRestoreArrayRead(xx,&x);
662: VecRestoreArray(zz,&zarray);
663: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
664: return(0);
665: }
667: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
668: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
669: {
670: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
671: PetscScalar *z = 0,*work,*workt,*zarray;
672: const PetscScalar *x,*xb;
673: const MatScalar *v;
675: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
676: const PetscInt *idx,*ii,*ridx=NULL;
677: PetscInt k;
678: PetscBool usecprow=a->compressedrow.use;
680: __m256d a0,a1,a2,a3,a4,a5;
681: __m256d w0,w1,w2,w3;
682: __m256d z0,z1,z2;
683: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
686: VecGetArrayRead(xx,&x);
687: VecGetArray(zz,&zarray);
689: idx = a->j;
690: v = a->a;
691: if (usecprow) {
692: mbs = a->compressedrow.nrows;
693: ii = a->compressedrow.i;
694: ridx = a->compressedrow.rindex;
695: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
696: } else {
697: mbs = a->mbs;
698: ii = a->i;
699: z = zarray;
700: }
702: if (!a->mult_work) {
703: k = PetscMax(A->rmap->n,A->cmap->n);
704: PetscMalloc1(k+1,&a->mult_work);
705: }
707: work = a->mult_work;
708: for (i=0; i<mbs; i++) {
709: n = ii[1] - ii[0]; ii++;
710: workt = work;
711: for (j=0; j<n; j++) {
712: xb = x + bs*(*idx++);
713: for (k=0; k<bs; k++) workt[k] = xb[k];
714: workt += bs;
715: }
716: if (usecprow) z = zarray + bs*ridx[i];
718: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
720: for (j=0; j<n; j++) {
721: // first column of a
722: w0 = _mm256_set1_pd(work[j*9 ]);
723: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
724: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
725: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
727: // second column of a
728: w1 = _mm256_set1_pd(work[j*9+ 1]);
729: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
730: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
731: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
733: // third column of a
734: w2 = _mm256_set1_pd(work[j*9 +2]);
735: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
736: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
737: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
739: // fourth column of a
740: w3 = _mm256_set1_pd(work[j*9+ 3]);
741: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
742: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
743: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
745: // fifth column of a
746: w0 = _mm256_set1_pd(work[j*9+ 4]);
747: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
748: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
749: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
751: // sixth column of a
752: w1 = _mm256_set1_pd(work[j*9+ 5]);
753: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
754: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
755: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
757: // seventh column of a
758: w2 = _mm256_set1_pd(work[j*9+ 6]);
759: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
760: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
761: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
763: // eigth column of a
764: w3 = _mm256_set1_pd(work[j*9+ 7]);
765: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
766: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
767: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
769: // ninth column of a
770: w0 = _mm256_set1_pd(work[j*9+ 8]);
771: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
772: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
773: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
774: }
776: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
778: v += n*bs2;
779: if (!usecprow) z += bs;
780: }
781: VecRestoreArrayRead(xx,&x);
782: VecRestoreArray(zz,&zarray);
783: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
784: return(0);
785: }
786: #endif
788: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
789: {
790: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
791: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
792: const PetscScalar *x,*xb;
793: PetscScalar *zarray,xv;
794: const MatScalar *v;
795: PetscErrorCode ierr;
796: const PetscInt *ii,*ij=a->j,*idx;
797: PetscInt mbs,i,j,k,n,*ridx=NULL;
798: PetscBool usecprow=a->compressedrow.use;
801: VecGetArrayRead(xx,&x);
802: VecGetArray(zz,&zarray);
804: v = a->a;
805: if (usecprow) {
806: mbs = a->compressedrow.nrows;
807: ii = a->compressedrow.i;
808: ridx = a->compressedrow.rindex;
809: PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));
810: } else {
811: mbs = a->mbs;
812: ii = a->i;
813: z = zarray;
814: }
816: for (i=0; i<mbs; i++) {
817: n = ii[i+1] - ii[i];
818: idx = ij + ii[i];
819: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
820: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
822: for (j=0; j<n; j++) {
823: xb = x + 11*(idx[j]);
825: for (k=0; k<11; k++) {
826: xv = xb[k];
827: sum1 += v[0]*xv;
828: sum2 += v[1]*xv;
829: sum3 += v[2]*xv;
830: sum4 += v[3]*xv;
831: sum5 += v[4]*xv;
832: sum6 += v[5]*xv;
833: sum7 += v[6]*xv;
834: sum8 += v[7]*xv;
835: sum9 += v[8]*xv;
836: sum10 += v[9]*xv;
837: sum11 += v[10]*xv;
838: v += 11;
839: }
840: }
841: if (usecprow) z = zarray + 11*ridx[i];
842: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
843: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
845: if (!usecprow) z += 11;
846: }
848: VecRestoreArrayRead(xx,&x);
849: VecRestoreArray(zz,&zarray);
850: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
851: return(0);
852: }
854: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
855: /* Default MatMult for block size 15 */
857: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
858: {
859: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
860: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
861: const PetscScalar *x,*xb;
862: PetscScalar *zarray,xv;
863: const MatScalar *v;
864: PetscErrorCode ierr;
865: const PetscInt *ii,*ij=a->j,*idx;
866: PetscInt mbs,i,j,k,n,*ridx=NULL;
867: PetscBool usecprow=a->compressedrow.use;
870: VecGetArrayRead(xx,&x);
871: VecGetArray(zz,&zarray);
873: v = a->a;
874: if (usecprow) {
875: mbs = a->compressedrow.nrows;
876: ii = a->compressedrow.i;
877: ridx = a->compressedrow.rindex;
878: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
879: } else {
880: mbs = a->mbs;
881: ii = a->i;
882: z = zarray;
883: }
885: for (i=0; i<mbs; i++) {
886: n = ii[i+1] - ii[i];
887: idx = ij + ii[i];
888: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
889: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
891: for (j=0; j<n; j++) {
892: xb = x + 15*(idx[j]);
894: for (k=0; k<15; k++) {
895: xv = xb[k];
896: sum1 += v[0]*xv;
897: sum2 += v[1]*xv;
898: sum3 += v[2]*xv;
899: sum4 += v[3]*xv;
900: sum5 += v[4]*xv;
901: sum6 += v[5]*xv;
902: sum7 += v[6]*xv;
903: sum8 += v[7]*xv;
904: sum9 += v[8]*xv;
905: sum10 += v[9]*xv;
906: sum11 += v[10]*xv;
907: sum12 += v[11]*xv;
908: sum13 += v[12]*xv;
909: sum14 += v[13]*xv;
910: sum15 += v[14]*xv;
911: v += 15;
912: }
913: }
914: if (usecprow) z = zarray + 15*ridx[i];
915: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
916: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
918: if (!usecprow) z += 15;
919: }
921: VecRestoreArrayRead(xx,&x);
922: VecRestoreArray(zz,&zarray);
923: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
924: return(0);
925: }
927: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
928: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
929: {
930: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
931: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
932: const PetscScalar *x,*xb;
933: PetscScalar x1,x2,x3,x4,*zarray;
934: const MatScalar *v;
935: PetscErrorCode ierr;
936: const PetscInt *ii,*ij=a->j,*idx;
937: PetscInt mbs,i,j,n,*ridx=NULL;
938: PetscBool usecprow=a->compressedrow.use;
941: VecGetArrayRead(xx,&x);
942: VecGetArray(zz,&zarray);
944: v = a->a;
945: if (usecprow) {
946: mbs = a->compressedrow.nrows;
947: ii = a->compressedrow.i;
948: ridx = a->compressedrow.rindex;
949: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
950: } else {
951: mbs = a->mbs;
952: ii = a->i;
953: z = zarray;
954: }
956: for (i=0; i<mbs; i++) {
957: n = ii[i+1] - ii[i];
958: idx = ij + ii[i];
959: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
960: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
962: for (j=0; j<n; j++) {
963: xb = x + 15*(idx[j]);
964: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
966: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
967: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
968: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
969: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
970: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
971: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
972: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
973: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
974: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
975: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
976: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
977: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
978: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
979: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
980: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
982: v += 60;
984: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
986: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
987: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
988: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
989: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
990: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
991: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
992: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
993: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
994: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
995: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
996: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
997: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
998: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
999: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1000: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1001: v += 60;
1003: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1004: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1005: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1006: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1007: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1008: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1009: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1010: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1011: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1012: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1013: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1014: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1015: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1016: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1017: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1018: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1019: v += 60;
1021: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1022: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1023: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1024: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1025: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1026: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1027: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1028: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1029: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1030: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1031: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1032: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1033: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1034: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1035: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1036: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1037: v += 45;
1038: }
1039: if (usecprow) z = zarray + 15*ridx[i];
1040: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1041: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1043: if (!usecprow) z += 15;
1044: }
1046: VecRestoreArrayRead(xx,&x);
1047: VecRestoreArray(zz,&zarray);
1048: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1049: return(0);
1050: }
1052: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1053: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1054: {
1055: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1056: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1057: const PetscScalar *x,*xb;
1058: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1059: const MatScalar *v;
1060: PetscErrorCode ierr;
1061: const PetscInt *ii,*ij=a->j,*idx;
1062: PetscInt mbs,i,j,n,*ridx=NULL;
1063: PetscBool usecprow=a->compressedrow.use;
1066: VecGetArrayRead(xx,&x);
1067: VecGetArray(zz,&zarray);
1069: v = a->a;
1070: if (usecprow) {
1071: mbs = a->compressedrow.nrows;
1072: ii = a->compressedrow.i;
1073: ridx = a->compressedrow.rindex;
1074: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1075: } else {
1076: mbs = a->mbs;
1077: ii = a->i;
1078: z = zarray;
1079: }
1081: for (i=0; i<mbs; i++) {
1082: n = ii[i+1] - ii[i];
1083: idx = ij + ii[i];
1084: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1085: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1087: for (j=0; j<n; j++) {
1088: xb = x + 15*(idx[j]);
1089: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1090: x8 = xb[7];
1092: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1093: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1094: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1095: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1096: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1097: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1098: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1099: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1100: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1101: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1102: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1103: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1104: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1105: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1106: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1107: v += 120;
1109: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1111: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1112: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1113: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1114: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1115: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1116: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1117: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1118: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1119: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1120: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1121: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1122: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1123: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1124: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1125: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1126: v += 105;
1127: }
1128: if (usecprow) z = zarray + 15*ridx[i];
1129: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1130: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1132: if (!usecprow) z += 15;
1133: }
1135: VecRestoreArrayRead(xx,&x);
1136: VecRestoreArray(zz,&zarray);
1137: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1138: return(0);
1139: }
1141: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1143: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1144: {
1145: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1146: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1147: const PetscScalar *x,*xb;
1148: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1149: const MatScalar *v;
1150: PetscErrorCode ierr;
1151: const PetscInt *ii,*ij=a->j,*idx;
1152: PetscInt mbs,i,j,n,*ridx=NULL;
1153: PetscBool usecprow=a->compressedrow.use;
1156: VecGetArrayRead(xx,&x);
1157: VecGetArray(zz,&zarray);
1159: v = a->a;
1160: if (usecprow) {
1161: mbs = a->compressedrow.nrows;
1162: ii = a->compressedrow.i;
1163: ridx = a->compressedrow.rindex;
1164: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1165: } else {
1166: mbs = a->mbs;
1167: ii = a->i;
1168: z = zarray;
1169: }
1171: for (i=0; i<mbs; i++) {
1172: n = ii[i+1] - ii[i];
1173: idx = ij + ii[i];
1174: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1175: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1177: for (j=0; j<n; j++) {
1178: xb = x + 15*(idx[j]);
1179: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1180: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1182: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1183: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1184: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1185: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1186: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1187: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1188: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1189: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1190: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1191: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1192: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1193: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1194: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1195: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1196: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1197: v += 225;
1198: }
1199: if (usecprow) z = zarray + 15*ridx[i];
1200: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1201: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1203: if (!usecprow) z += 15;
1204: }
1206: VecRestoreArrayRead(xx,&x);
1207: VecRestoreArray(zz,&zarray);
1208: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1209: return(0);
1210: }
1213: /*
1214: This will not work with MatScalar == float because it calls the BLAS
1215: */
1216: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1217: {
1218: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1219: PetscScalar *z = 0,*work,*workt,*zarray;
1220: const PetscScalar *x,*xb;
1221: const MatScalar *v;
1222: PetscErrorCode ierr;
1223: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1224: const PetscInt *idx,*ii,*ridx=NULL;
1225: PetscInt ncols,k;
1226: PetscBool usecprow=a->compressedrow.use;
1229: VecGetArrayRead(xx,&x);
1230: VecGetArray(zz,&zarray);
1232: idx = a->j;
1233: v = a->a;
1234: if (usecprow) {
1235: mbs = a->compressedrow.nrows;
1236: ii = a->compressedrow.i;
1237: ridx = a->compressedrow.rindex;
1238: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1239: } else {
1240: mbs = a->mbs;
1241: ii = a->i;
1242: z = zarray;
1243: }
1245: if (!a->mult_work) {
1246: k = PetscMax(A->rmap->n,A->cmap->n);
1247: PetscMalloc1(k+1,&a->mult_work);
1248: }
1249: work = a->mult_work;
1250: for (i=0; i<mbs; i++) {
1251: n = ii[1] - ii[0]; ii++;
1252: ncols = n*bs;
1253: workt = work;
1254: for (j=0; j<n; j++) {
1255: xb = x + bs*(*idx++);
1256: for (k=0; k<bs; k++) workt[k] = xb[k];
1257: workt += bs;
1258: }
1259: if (usecprow) z = zarray + bs*ridx[i];
1260: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1261: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1262: v += n*bs2;
1263: if (!usecprow) z += bs;
1264: }
1265: VecRestoreArrayRead(xx,&x);
1266: VecRestoreArray(zz,&zarray);
1267: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1268: return(0);
1269: }
1271: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1272: {
1273: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1274: const PetscScalar *x;
1275: PetscScalar *y,*z,sum;
1276: const MatScalar *v;
1277: PetscErrorCode ierr;
1278: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1279: const PetscInt *idx,*ii;
1280: PetscBool usecprow=a->compressedrow.use;
1283: VecGetArrayRead(xx,&x);
1284: VecGetArrayPair(yy,zz,&y,&z);
1286: idx = a->j;
1287: v = a->a;
1288: if (usecprow) {
1289: if (zz != yy) {
1290: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1291: }
1292: mbs = a->compressedrow.nrows;
1293: ii = a->compressedrow.i;
1294: ridx = a->compressedrow.rindex;
1295: } else {
1296: ii = a->i;
1297: }
1299: for (i=0; i<mbs; i++) {
1300: n = ii[1] - ii[0];
1301: ii++;
1302: if (!usecprow) {
1303: sum = y[i];
1304: } else {
1305: sum = y[ridx[i]];
1306: }
1307: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1308: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1309: PetscSparseDensePlusDot(sum,x,v,idx,n);
1310: v += n;
1311: idx += n;
1312: if (usecprow) {
1313: z[ridx[i]] = sum;
1314: } else {
1315: z[i] = sum;
1316: }
1317: }
1318: VecRestoreArrayRead(xx,&x);
1319: VecRestoreArrayPair(yy,zz,&y,&z);
1320: PetscLogFlops(2.0*a->nz);
1321: return(0);
1322: }
1324: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1325: {
1326: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1327: PetscScalar *y = 0,*z = 0,sum1,sum2;
1328: const PetscScalar *x,*xb;
1329: PetscScalar x1,x2,*yarray,*zarray;
1330: const MatScalar *v;
1331: PetscErrorCode ierr;
1332: PetscInt mbs = a->mbs,i,n,j;
1333: const PetscInt *idx,*ii,*ridx = NULL;
1334: PetscBool usecprow = a->compressedrow.use;
1337: VecGetArrayRead(xx,&x);
1338: VecGetArrayPair(yy,zz,&yarray,&zarray);
1340: idx = a->j;
1341: v = a->a;
1342: if (usecprow) {
1343: if (zz != yy) {
1344: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1345: }
1346: mbs = a->compressedrow.nrows;
1347: ii = a->compressedrow.i;
1348: ridx = a->compressedrow.rindex;
1349: } else {
1350: ii = a->i;
1351: y = yarray;
1352: z = zarray;
1353: }
1355: for (i=0; i<mbs; i++) {
1356: n = ii[1] - ii[0]; ii++;
1357: if (usecprow) {
1358: z = zarray + 2*ridx[i];
1359: y = yarray + 2*ridx[i];
1360: }
1361: sum1 = y[0]; sum2 = y[1];
1362: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1363: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364: for (j=0; j<n; j++) {
1365: xb = x + 2*(*idx++);
1366: x1 = xb[0];
1367: x2 = xb[1];
1369: sum1 += v[0]*x1 + v[2]*x2;
1370: sum2 += v[1]*x1 + v[3]*x2;
1371: v += 4;
1372: }
1373: z[0] = sum1; z[1] = sum2;
1374: if (!usecprow) {
1375: z += 2; y += 2;
1376: }
1377: }
1378: VecRestoreArrayRead(xx,&x);
1379: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1380: PetscLogFlops(4.0*a->nz);
1381: return(0);
1382: }
1384: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1385: {
1386: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1387: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1388: const PetscScalar *x,*xb;
1389: const MatScalar *v;
1390: PetscErrorCode ierr;
1391: PetscInt mbs = a->mbs,i,j,n;
1392: const PetscInt *idx,*ii,*ridx = NULL;
1393: PetscBool usecprow = a->compressedrow.use;
1396: VecGetArrayRead(xx,&x);
1397: VecGetArrayPair(yy,zz,&yarray,&zarray);
1399: idx = a->j;
1400: v = a->a;
1401: if (usecprow) {
1402: if (zz != yy) {
1403: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1404: }
1405: mbs = a->compressedrow.nrows;
1406: ii = a->compressedrow.i;
1407: ridx = a->compressedrow.rindex;
1408: } else {
1409: ii = a->i;
1410: y = yarray;
1411: z = zarray;
1412: }
1414: for (i=0; i<mbs; i++) {
1415: n = ii[1] - ii[0]; ii++;
1416: if (usecprow) {
1417: z = zarray + 3*ridx[i];
1418: y = yarray + 3*ridx[i];
1419: }
1420: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1421: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1422: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1423: for (j=0; j<n; j++) {
1424: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1425: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1426: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1427: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1428: v += 9;
1429: }
1430: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1431: if (!usecprow) {
1432: z += 3; y += 3;
1433: }
1434: }
1435: VecRestoreArrayRead(xx,&x);
1436: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1437: PetscLogFlops(18.0*a->nz);
1438: return(0);
1439: }
1441: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1442: {
1443: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1444: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1445: const PetscScalar *x,*xb;
1446: const MatScalar *v;
1447: PetscErrorCode ierr;
1448: PetscInt mbs = a->mbs,i,j,n;
1449: const PetscInt *idx,*ii,*ridx=NULL;
1450: PetscBool usecprow=a->compressedrow.use;
1453: VecGetArrayRead(xx,&x);
1454: VecGetArrayPair(yy,zz,&yarray,&zarray);
1456: idx = a->j;
1457: v = a->a;
1458: if (usecprow) {
1459: if (zz != yy) {
1460: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1461: }
1462: mbs = a->compressedrow.nrows;
1463: ii = a->compressedrow.i;
1464: ridx = a->compressedrow.rindex;
1465: } else {
1466: ii = a->i;
1467: y = yarray;
1468: z = zarray;
1469: }
1471: for (i=0; i<mbs; i++) {
1472: n = ii[1] - ii[0]; ii++;
1473: if (usecprow) {
1474: z = zarray + 4*ridx[i];
1475: y = yarray + 4*ridx[i];
1476: }
1477: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1478: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1479: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1480: for (j=0; j<n; j++) {
1481: xb = x + 4*(*idx++);
1482: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1483: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1484: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1485: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1486: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1487: v += 16;
1488: }
1489: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1490: if (!usecprow) {
1491: z += 4; y += 4;
1492: }
1493: }
1494: VecRestoreArrayRead(xx,&x);
1495: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1496: PetscLogFlops(32.0*a->nz);
1497: return(0);
1498: }
1500: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1501: {
1502: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1503: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1504: const PetscScalar *x,*xb;
1505: PetscScalar *yarray,*zarray;
1506: const MatScalar *v;
1507: PetscErrorCode ierr;
1508: PetscInt mbs = a->mbs,i,j,n;
1509: const PetscInt *idx,*ii,*ridx = NULL;
1510: PetscBool usecprow=a->compressedrow.use;
1513: VecGetArrayRead(xx,&x);
1514: VecGetArrayPair(yy,zz,&yarray,&zarray);
1516: idx = a->j;
1517: v = a->a;
1518: if (usecprow) {
1519: if (zz != yy) {
1520: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1521: }
1522: mbs = a->compressedrow.nrows;
1523: ii = a->compressedrow.i;
1524: ridx = a->compressedrow.rindex;
1525: } else {
1526: ii = a->i;
1527: y = yarray;
1528: z = zarray;
1529: }
1531: for (i=0; i<mbs; i++) {
1532: n = ii[1] - ii[0]; ii++;
1533: if (usecprow) {
1534: z = zarray + 5*ridx[i];
1535: y = yarray + 5*ridx[i];
1536: }
1537: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1538: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1539: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1540: for (j=0; j<n; j++) {
1541: xb = x + 5*(*idx++);
1542: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1543: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1544: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1545: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1546: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1547: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1548: v += 25;
1549: }
1550: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1551: if (!usecprow) {
1552: z += 5; y += 5;
1553: }
1554: }
1555: VecRestoreArrayRead(xx,&x);
1556: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1557: PetscLogFlops(50.0*a->nz);
1558: return(0);
1559: }
1560: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1561: {
1562: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1563: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1564: const PetscScalar *x,*xb;
1565: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1566: const MatScalar *v;
1567: PetscErrorCode ierr;
1568: PetscInt mbs = a->mbs,i,j,n;
1569: const PetscInt *idx,*ii,*ridx=NULL;
1570: PetscBool usecprow=a->compressedrow.use;
1573: VecGetArrayRead(xx,&x);
1574: VecGetArrayPair(yy,zz,&yarray,&zarray);
1576: idx = a->j;
1577: v = a->a;
1578: if (usecprow) {
1579: if (zz != yy) {
1580: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1581: }
1582: mbs = a->compressedrow.nrows;
1583: ii = a->compressedrow.i;
1584: ridx = a->compressedrow.rindex;
1585: } else {
1586: ii = a->i;
1587: y = yarray;
1588: z = zarray;
1589: }
1591: for (i=0; i<mbs; i++) {
1592: n = ii[1] - ii[0]; ii++;
1593: if (usecprow) {
1594: z = zarray + 6*ridx[i];
1595: y = yarray + 6*ridx[i];
1596: }
1597: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1598: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1599: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600: for (j=0; j<n; j++) {
1601: xb = x + 6*(*idx++);
1602: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1603: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1604: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1605: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1606: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1607: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1608: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1609: v += 36;
1610: }
1611: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1612: if (!usecprow) {
1613: z += 6; y += 6;
1614: }
1615: }
1616: VecRestoreArrayRead(xx,&x);
1617: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1618: PetscLogFlops(72.0*a->nz);
1619: return(0);
1620: }
1622: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1623: {
1624: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1625: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1626: const PetscScalar *x,*xb;
1627: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1628: const MatScalar *v;
1629: PetscErrorCode ierr;
1630: PetscInt mbs = a->mbs,i,j,n;
1631: const PetscInt *idx,*ii,*ridx = NULL;
1632: PetscBool usecprow=a->compressedrow.use;
1635: VecGetArrayRead(xx,&x);
1636: VecGetArrayPair(yy,zz,&yarray,&zarray);
1638: idx = a->j;
1639: v = a->a;
1640: if (usecprow) {
1641: if (zz != yy) {
1642: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1643: }
1644: mbs = a->compressedrow.nrows;
1645: ii = a->compressedrow.i;
1646: ridx = a->compressedrow.rindex;
1647: } else {
1648: ii = a->i;
1649: y = yarray;
1650: z = zarray;
1651: }
1653: for (i=0; i<mbs; i++) {
1654: n = ii[1] - ii[0]; ii++;
1655: if (usecprow) {
1656: z = zarray + 7*ridx[i];
1657: y = yarray + 7*ridx[i];
1658: }
1659: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1660: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1661: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1662: for (j=0; j<n; j++) {
1663: xb = x + 7*(*idx++);
1664: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1665: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1666: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1667: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1668: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1669: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1670: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1671: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1672: v += 49;
1673: }
1674: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1675: if (!usecprow) {
1676: z += 7; y += 7;
1677: }
1678: }
1679: VecRestoreArrayRead(xx,&x);
1680: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1681: PetscLogFlops(98.0*a->nz);
1682: return(0);
1683: }
1685: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1686: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1687: {
1688: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1689: PetscScalar *z = 0,*work,*workt,*zarray;
1690: const PetscScalar *x,*xb;
1691: const MatScalar *v;
1693: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1694: PetscInt k;
1695: PetscBool usecprow=a->compressedrow.use;
1696: const PetscInt *idx,*ii,*ridx=NULL;
1698: __m256d a0,a1,a2,a3,a4,a5;
1699: __m256d w0,w1,w2,w3;
1700: __m256d z0,z1,z2;
1701: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1704: VecCopy(yy,zz);
1705: VecGetArrayRead(xx,&x);
1706: VecGetArray(zz,&zarray);
1708: idx = a->j;
1709: v = a->a;
1710: if (usecprow) {
1711: mbs = a->compressedrow.nrows;
1712: ii = a->compressedrow.i;
1713: ridx = a->compressedrow.rindex;
1714: } else {
1715: mbs = a->mbs;
1716: ii = a->i;
1717: z = zarray;
1718: }
1720: if (!a->mult_work) {
1721: k = PetscMax(A->rmap->n,A->cmap->n);
1722: PetscMalloc1(k+1,&a->mult_work);
1723: }
1725: work = a->mult_work;
1726: for (i=0; i<mbs; i++) {
1727: n = ii[1] - ii[0]; ii++;
1728: workt = work;
1729: for (j=0; j<n; j++) {
1730: xb = x + bs*(*idx++);
1731: for (k=0; k<bs; k++) workt[k] = xb[k];
1732: workt += bs;
1733: }
1734: if (usecprow) z = zarray + bs*ridx[i];
1736: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1738: for (j=0; j<n; j++) {
1739: // first column of a
1740: w0 = _mm256_set1_pd(work[j*9 ]);
1741: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1742: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1743: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1745: // second column of a
1746: w1 = _mm256_set1_pd(work[j*9+ 1]);
1747: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1748: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1749: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1751: // third column of a
1752: w2 = _mm256_set1_pd(work[j*9+ 2]);
1753: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1754: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1755: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1757: // fourth column of a
1758: w3 = _mm256_set1_pd(work[j*9+ 3]);
1759: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1760: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1761: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1763: // fifth column of a
1764: w0 = _mm256_set1_pd(work[j*9+ 4]);
1765: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1766: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1767: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1769: // sixth column of a
1770: w1 = _mm256_set1_pd(work[j*9+ 5]);
1771: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1772: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1773: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1775: // seventh column of a
1776: w2 = _mm256_set1_pd(work[j*9+ 6]);
1777: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1778: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1779: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1781: // eigth column of a
1782: w3 = _mm256_set1_pd(work[j*9+ 7]);
1783: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1784: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1785: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1787: // ninth column of a
1788: w0 = _mm256_set1_pd(work[j*9+ 8]);
1789: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1790: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1791: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1792: }
1794: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1796: v += n*bs2;
1797: if (!usecprow) z += bs;
1798: }
1799: VecRestoreArrayRead(xx,&x);
1800: VecRestoreArray(zz,&zarray);
1801: PetscLogFlops(162.0*a->nz);
1802: return(0);
1803: }
1804: #endif
1806: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1807: {
1808: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1809: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1810: const PetscScalar *x,*xb;
1811: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1812: const MatScalar *v;
1813: PetscErrorCode ierr;
1814: PetscInt mbs = a->mbs,i,j,n;
1815: const PetscInt *idx,*ii,*ridx = NULL;
1816: PetscBool usecprow=a->compressedrow.use;
1819: VecGetArrayRead(xx,&x);
1820: VecGetArrayPair(yy,zz,&yarray,&zarray);
1822: idx = a->j;
1823: v = a->a;
1824: if (usecprow) {
1825: if (zz != yy) {
1826: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1827: }
1828: mbs = a->compressedrow.nrows;
1829: ii = a->compressedrow.i;
1830: ridx = a->compressedrow.rindex;
1831: } else {
1832: ii = a->i;
1833: y = yarray;
1834: z = zarray;
1835: }
1837: for (i=0; i<mbs; i++) {
1838: n = ii[1] - ii[0]; ii++;
1839: if (usecprow) {
1840: z = zarray + 11*ridx[i];
1841: y = yarray + 11*ridx[i];
1842: }
1843: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1844: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1845: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1846: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1847: for (j=0; j<n; j++) {
1848: xb = x + 11*(*idx++);
1849: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1850: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1851: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1852: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1853: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1854: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1855: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1856: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1857: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1858: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1859: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1860: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1861: v += 121;
1862: }
1863: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1864: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1865: if (!usecprow) {
1866: z += 11; y += 11;
1867: }
1868: }
1869: VecRestoreArrayRead(xx,&x);
1870: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1871: PetscLogFlops(242.0*a->nz);
1872: return(0);
1873: }
1875: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1876: {
1877: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1878: PetscScalar *z = 0,*work,*workt,*zarray;
1879: const PetscScalar *x,*xb;
1880: const MatScalar *v;
1881: PetscErrorCode ierr;
1882: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1883: PetscInt ncols,k;
1884: const PetscInt *ridx = NULL,*idx,*ii;
1885: PetscBool usecprow = a->compressedrow.use;
1888: VecCopy(yy,zz);
1889: VecGetArrayRead(xx,&x);
1890: VecGetArray(zz,&zarray);
1892: idx = a->j;
1893: v = a->a;
1894: if (usecprow) {
1895: mbs = a->compressedrow.nrows;
1896: ii = a->compressedrow.i;
1897: ridx = a->compressedrow.rindex;
1898: } else {
1899: mbs = a->mbs;
1900: ii = a->i;
1901: z = zarray;
1902: }
1904: if (!a->mult_work) {
1905: k = PetscMax(A->rmap->n,A->cmap->n);
1906: PetscMalloc1(k+1,&a->mult_work);
1907: }
1908: work = a->mult_work;
1909: for (i=0; i<mbs; i++) {
1910: n = ii[1] - ii[0]; ii++;
1911: ncols = n*bs;
1912: workt = work;
1913: for (j=0; j<n; j++) {
1914: xb = x + bs*(*idx++);
1915: for (k=0; k<bs; k++) workt[k] = xb[k];
1916: workt += bs;
1917: }
1918: if (usecprow) z = zarray + bs*ridx[i];
1919: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1920: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1921: v += n*bs2;
1922: if (!usecprow) z += bs;
1923: }
1924: VecRestoreArrayRead(xx,&x);
1925: VecRestoreArray(zz,&zarray);
1926: PetscLogFlops(2.0*a->nz*bs2);
1927: return(0);
1928: }
1930: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1931: {
1932: PetscScalar zero = 0.0;
1936: VecSet(zz,zero);
1937: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1938: return(0);
1939: }
1941: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1942: {
1943: PetscScalar zero = 0.0;
1947: VecSet(zz,zero);
1948: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1949: return(0);
1950: }
1952: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1953: {
1954: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1955: PetscScalar *z,x1,x2,x3,x4,x5;
1956: const PetscScalar *x,*xb = NULL;
1957: const MatScalar *v;
1958: PetscErrorCode ierr;
1959: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1960: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1961: Mat_CompressedRow cprow = a->compressedrow;
1962: PetscBool usecprow = cprow.use;
1965: if (yy != zz) { VecCopy(yy,zz); }
1966: VecGetArrayRead(xx,&x);
1967: VecGetArray(zz,&z);
1969: idx = a->j;
1970: v = a->a;
1971: if (usecprow) {
1972: mbs = cprow.nrows;
1973: ii = cprow.i;
1974: ridx = cprow.rindex;
1975: } else {
1976: mbs=a->mbs;
1977: ii = a->i;
1978: xb = x;
1979: }
1981: switch (bs) {
1982: case 1:
1983: for (i=0; i<mbs; i++) {
1984: if (usecprow) xb = x + ridx[i];
1985: x1 = xb[0];
1986: ib = idx + ii[0];
1987: n = ii[1] - ii[0]; ii++;
1988: for (j=0; j<n; j++) {
1989: rval = ib[j];
1990: z[rval] += PetscConj(*v) * x1;
1991: v++;
1992: }
1993: if (!usecprow) xb++;
1994: }
1995: break;
1996: case 2:
1997: for (i=0; i<mbs; i++) {
1998: if (usecprow) xb = x + 2*ridx[i];
1999: x1 = xb[0]; x2 = xb[1];
2000: ib = idx + ii[0];
2001: n = ii[1] - ii[0]; ii++;
2002: for (j=0; j<n; j++) {
2003: rval = ib[j]*2;
2004: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2005: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2006: v += 4;
2007: }
2008: if (!usecprow) xb += 2;
2009: }
2010: break;
2011: case 3:
2012: for (i=0; i<mbs; i++) {
2013: if (usecprow) xb = x + 3*ridx[i];
2014: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2015: ib = idx + ii[0];
2016: n = ii[1] - ii[0]; ii++;
2017: for (j=0; j<n; j++) {
2018: rval = ib[j]*3;
2019: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2020: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2021: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2022: v += 9;
2023: }
2024: if (!usecprow) xb += 3;
2025: }
2026: break;
2027: case 4:
2028: for (i=0; i<mbs; i++) {
2029: if (usecprow) xb = x + 4*ridx[i];
2030: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2031: ib = idx + ii[0];
2032: n = ii[1] - ii[0]; ii++;
2033: for (j=0; j<n; j++) {
2034: rval = ib[j]*4;
2035: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2036: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2037: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2038: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2039: v += 16;
2040: }
2041: if (!usecprow) xb += 4;
2042: }
2043: break;
2044: case 5:
2045: for (i=0; i<mbs; i++) {
2046: if (usecprow) xb = x + 5*ridx[i];
2047: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2048: x4 = xb[3]; x5 = xb[4];
2049: ib = idx + ii[0];
2050: n = ii[1] - ii[0]; ii++;
2051: for (j=0; j<n; j++) {
2052: rval = ib[j]*5;
2053: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2054: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2055: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2056: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2057: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2058: v += 25;
2059: }
2060: if (!usecprow) xb += 5;
2061: }
2062: break;
2063: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2064: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2065: #if 0
2066: {
2067: PetscInt ncols,k,bs2=a->bs2;
2068: PetscScalar *work,*workt,zb;
2069: const PetscScalar *xtmp;
2070: if (!a->mult_work) {
2071: k = PetscMax(A->rmap->n,A->cmap->n);
2072: PetscMalloc1(k+1,&a->mult_work);
2073: }
2074: work = a->mult_work;
2075: xtmp = x;
2076: for (i=0; i<mbs; i++) {
2077: n = ii[1] - ii[0]; ii++;
2078: ncols = n*bs;
2079: PetscMemzero(work,ncols*sizeof(PetscScalar));
2080: if (usecprow) xtmp = x + bs*ridx[i];
2081: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2082: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2083: v += n*bs2;
2084: if (!usecprow) xtmp += bs;
2085: workt = work;
2086: for (j=0; j<n; j++) {
2087: zb = z + bs*(*idx++);
2088: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2089: workt += bs;
2090: }
2091: }
2092: }
2093: #endif
2094: }
2095: VecRestoreArrayRead(xx,&x);
2096: VecRestoreArray(zz,&z);
2097: PetscLogFlops(2.0*a->nz*a->bs2);
2098: return(0);
2099: }
2101: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2102: {
2103: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2104: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2105: const PetscScalar *x,*xb = 0;
2106: const MatScalar *v;
2107: PetscErrorCode ierr;
2108: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2109: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2110: Mat_CompressedRow cprow = a->compressedrow;
2111: PetscBool usecprow=cprow.use;
2114: if (yy != zz) { VecCopy(yy,zz); }
2115: VecGetArrayRead(xx,&x);
2116: VecGetArray(zz,&z);
2118: idx = a->j;
2119: v = a->a;
2120: if (usecprow) {
2121: mbs = cprow.nrows;
2122: ii = cprow.i;
2123: ridx = cprow.rindex;
2124: } else {
2125: mbs=a->mbs;
2126: ii = a->i;
2127: xb = x;
2128: }
2130: switch (bs) {
2131: case 1:
2132: for (i=0; i<mbs; i++) {
2133: if (usecprow) xb = x + ridx[i];
2134: x1 = xb[0];
2135: ib = idx + ii[0];
2136: n = ii[1] - ii[0]; ii++;
2137: for (j=0; j<n; j++) {
2138: rval = ib[j];
2139: z[rval] += *v * x1;
2140: v++;
2141: }
2142: if (!usecprow) xb++;
2143: }
2144: break;
2145: case 2:
2146: for (i=0; i<mbs; i++) {
2147: if (usecprow) xb = x + 2*ridx[i];
2148: x1 = xb[0]; x2 = xb[1];
2149: ib = idx + ii[0];
2150: n = ii[1] - ii[0]; ii++;
2151: for (j=0; j<n; j++) {
2152: rval = ib[j]*2;
2153: z[rval++] += v[0]*x1 + v[1]*x2;
2154: z[rval++] += v[2]*x1 + v[3]*x2;
2155: v += 4;
2156: }
2157: if (!usecprow) xb += 2;
2158: }
2159: break;
2160: case 3:
2161: for (i=0; i<mbs; i++) {
2162: if (usecprow) xb = x + 3*ridx[i];
2163: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2164: ib = idx + ii[0];
2165: n = ii[1] - ii[0]; ii++;
2166: for (j=0; j<n; j++) {
2167: rval = ib[j]*3;
2168: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2169: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2170: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2171: v += 9;
2172: }
2173: if (!usecprow) xb += 3;
2174: }
2175: break;
2176: case 4:
2177: for (i=0; i<mbs; i++) {
2178: if (usecprow) xb = x + 4*ridx[i];
2179: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2180: ib = idx + ii[0];
2181: n = ii[1] - ii[0]; ii++;
2182: for (j=0; j<n; j++) {
2183: rval = ib[j]*4;
2184: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2185: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2186: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2187: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2188: v += 16;
2189: }
2190: if (!usecprow) xb += 4;
2191: }
2192: break;
2193: case 5:
2194: for (i=0; i<mbs; i++) {
2195: if (usecprow) xb = x + 5*ridx[i];
2196: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2197: x4 = xb[3]; x5 = xb[4];
2198: ib = idx + ii[0];
2199: n = ii[1] - ii[0]; ii++;
2200: for (j=0; j<n; j++) {
2201: rval = ib[j]*5;
2202: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2203: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2204: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2205: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2206: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2207: v += 25;
2208: }
2209: if (!usecprow) xb += 5;
2210: }
2211: break;
2212: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2213: PetscInt ncols,k;
2214: PetscScalar *work,*workt;
2215: const PetscScalar *xtmp;
2216: if (!a->mult_work) {
2217: k = PetscMax(A->rmap->n,A->cmap->n);
2218: PetscMalloc1(k+1,&a->mult_work);
2219: }
2220: work = a->mult_work;
2221: xtmp = x;
2222: for (i=0; i<mbs; i++) {
2223: n = ii[1] - ii[0]; ii++;
2224: ncols = n*bs;
2225: PetscMemzero(work,ncols*sizeof(PetscScalar));
2226: if (usecprow) xtmp = x + bs*ridx[i];
2227: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2228: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2229: v += n*bs2;
2230: if (!usecprow) xtmp += bs;
2231: workt = work;
2232: for (j=0; j<n; j++) {
2233: zb = z + bs*(*idx++);
2234: for (k=0; k<bs; k++) zb[k] += workt[k];
2235: workt += bs;
2236: }
2237: }
2238: }
2239: }
2240: VecRestoreArrayRead(xx,&x);
2241: VecRestoreArray(zz,&z);
2242: PetscLogFlops(2.0*a->nz*a->bs2);
2243: return(0);
2244: }
2246: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2247: {
2248: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2249: PetscInt totalnz = a->bs2*a->nz;
2250: PetscScalar oalpha = alpha;
2252: PetscBLASInt one = 1,tnz;
2255: PetscBLASIntCast(totalnz,&tnz);
2256: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2257: PetscLogFlops(totalnz);
2258: return(0);
2259: }
2261: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2262: {
2264: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2265: MatScalar *v = a->a;
2266: PetscReal sum = 0.0;
2267: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2270: if (type == NORM_FROBENIUS) {
2271: #if defined(PETSC_USE_REAL___FP16)
2272: PetscBLASInt one = 1,cnt = bs2*nz;
2273: *norm = BLASnrm2_(&cnt,v,&one);
2274: #else
2275: for (i=0; i<bs2*nz; i++) {
2276: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2277: }
2278: #endif
2279: *norm = PetscSqrtReal(sum);
2280: PetscLogFlops(2*bs2*nz);
2281: } else if (type == NORM_1) { /* maximum column sum */
2282: PetscReal *tmp;
2283: PetscInt *bcol = a->j;
2284: PetscCalloc1(A->cmap->n+1,&tmp);
2285: for (i=0; i<nz; i++) {
2286: for (j=0; j<bs; j++) {
2287: k1 = bs*(*bcol) + j; /* column index */
2288: for (k=0; k<bs; k++) {
2289: tmp[k1] += PetscAbsScalar(*v); v++;
2290: }
2291: }
2292: bcol++;
2293: }
2294: *norm = 0.0;
2295: for (j=0; j<A->cmap->n; j++) {
2296: if (tmp[j] > *norm) *norm = tmp[j];
2297: }
2298: PetscFree(tmp);
2299: PetscLogFlops(PetscMax(bs2*nz-1,0));
2300: } else if (type == NORM_INFINITY) { /* maximum row sum */
2301: *norm = 0.0;
2302: for (k=0; k<bs; k++) {
2303: for (j=0; j<a->mbs; j++) {
2304: v = a->a + bs2*a->i[j] + k;
2305: sum = 0.0;
2306: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2307: for (k1=0; k1<bs; k1++) {
2308: sum += PetscAbsScalar(*v);
2309: v += bs;
2310: }
2311: }
2312: if (sum > *norm) *norm = sum;
2313: }
2314: }
2315: PetscLogFlops(PetscMax(bs2*nz-1,0));
2316: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2317: return(0);
2318: }
2321: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2322: {
2323: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2327: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2328: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2329: *flg = PETSC_FALSE;
2330: return(0);
2331: }
2333: /* if the a->i are the same */
2334: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2335: if (!*flg) return(0);
2337: /* if a->j are the same */
2338: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2339: if (!*flg) return(0);
2341: /* if a->a are the same */
2342: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2343: return(0);
2345: }
2347: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2348: {
2349: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2351: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2352: PetscScalar *x,zero = 0.0;
2353: MatScalar *aa,*aa_j;
2356: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2357: bs = A->rmap->bs;
2358: aa = a->a;
2359: ai = a->i;
2360: aj = a->j;
2361: ambs = a->mbs;
2362: bs2 = a->bs2;
2364: VecSet(v,zero);
2365: VecGetArray(v,&x);
2366: VecGetLocalSize(v,&n);
2367: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2368: for (i=0; i<ambs; i++) {
2369: for (j=ai[i]; j<ai[i+1]; j++) {
2370: if (aj[j] == i) {
2371: row = i*bs;
2372: aa_j = aa+j*bs2;
2373: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2374: break;
2375: }
2376: }
2377: }
2378: VecRestoreArray(v,&x);
2379: return(0);
2380: }
2382: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2383: {
2384: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2385: const PetscScalar *l,*r,*li,*ri;
2386: PetscScalar x;
2387: MatScalar *aa, *v;
2388: PetscErrorCode ierr;
2389: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2390: const PetscInt *ai,*aj;
2393: ai = a->i;
2394: aj = a->j;
2395: aa = a->a;
2396: m = A->rmap->n;
2397: n = A->cmap->n;
2398: bs = A->rmap->bs;
2399: mbs = a->mbs;
2400: bs2 = a->bs2;
2401: if (ll) {
2402: VecGetArrayRead(ll,&l);
2403: VecGetLocalSize(ll,&lm);
2404: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2405: for (i=0; i<mbs; i++) { /* for each block row */
2406: M = ai[i+1] - ai[i];
2407: li = l + i*bs;
2408: v = aa + bs2*ai[i];
2409: for (j=0; j<M; j++) { /* for each block */
2410: for (k=0; k<bs2; k++) {
2411: (*v++) *= li[k%bs];
2412: }
2413: }
2414: }
2415: VecRestoreArrayRead(ll,&l);
2416: PetscLogFlops(a->nz);
2417: }
2419: if (rr) {
2420: VecGetArrayRead(rr,&r);
2421: VecGetLocalSize(rr,&rn);
2422: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2423: for (i=0; i<mbs; i++) { /* for each block row */
2424: iai = ai[i];
2425: M = ai[i+1] - iai;
2426: v = aa + bs2*iai;
2427: for (j=0; j<M; j++) { /* for each block */
2428: ri = r + bs*aj[iai+j];
2429: for (k=0; k<bs; k++) {
2430: x = ri[k];
2431: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2432: v += bs;
2433: }
2434: }
2435: }
2436: VecRestoreArrayRead(rr,&r);
2437: PetscLogFlops(a->nz);
2438: }
2439: return(0);
2440: }
2443: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2444: {
2445: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2448: info->block_size = a->bs2;
2449: info->nz_allocated = a->bs2*a->maxnz;
2450: info->nz_used = a->bs2*a->nz;
2451: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2452: info->assemblies = A->num_ass;
2453: info->mallocs = A->info.mallocs;
2454: info->memory = ((PetscObject)A)->mem;
2455: if (A->factortype) {
2456: info->fill_ratio_given = A->info.fill_ratio_given;
2457: info->fill_ratio_needed = A->info.fill_ratio_needed;
2458: info->factor_mallocs = A->info.factor_mallocs;
2459: } else {
2460: info->fill_ratio_given = 0;
2461: info->fill_ratio_needed = 0;
2462: info->factor_mallocs = 0;
2463: }
2464: return(0);
2465: }
2467: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2468: {
2469: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2473: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2474: return(0);
2475: }