Actual source code: baij2.c
petsc-3.9.4 2018-09-11
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
15: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
16: const PetscInt *idx;
17: PetscInt start,end,*ai,*aj,bs,*nidx2;
18: PetscBT table;
21: m = a->mbs;
22: ai = a->i;
23: aj = a->j;
24: bs = A->rmap->bs;
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28: PetscBTCreate(m,&table);
29: PetscMalloc1(m+1,&nidx);
30: PetscMalloc1(A->rmap->N+1,&nidx2);
32: for (i=0; i<is_max; i++) {
33: /* Initialise the two local arrays */
34: isz = 0;
35: PetscBTMemzero(m,table);
37: /* Extract the indices, assume there can be duplicate entries */
38: ISGetIndices(is[i],&idx);
39: ISGetLocalSize(is[i],&n);
41: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42: for (j=0; j<n; ++j) {
43: ival = idx[j]/bs; /* convert the indices into block indices */
44: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(&is[i]);
50: k = 0;
51: for (j=0; j<ov; j++) { /* for each overlap*/
52: n = isz;
53: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
54: row = nidx[k];
55: start = ai[row];
56: end = ai[row+1];
57: for (l = start; l<end; l++) {
58: val = aj[l];
59: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60: }
61: }
62: }
63: /* expand the Index Set */
64: for (j=0; j<isz; j++) {
65: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66: }
67: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
68: }
69: PetscBTDestroy(&table);
70: PetscFree(nidx);
71: PetscFree(nidx2);
72: return(0);
73: }
75: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
89: ISGetIndices(isrow,&irow);
90: ISGetIndices(iscol,&icol);
91: ISGetLocalSize(isrow,&nrows);
92: ISGetLocalSize(iscol,&ncols);
94: PetscCalloc1(1+oldcols,&smap);
95: ssmap = smap;
96: PetscMalloc1(1+nrows,&lens);
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) lens[i]++;
105: }
106: }
107: /* Create and fill new matrix */
108: if (scall == MAT_REUSE_MATRIX) {
109: c = (Mat_SeqBAIJ*)((*B)->data);
111: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
113: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
115: C = *B;
116: } else {
117: MatCreate(PetscObjectComm((PetscObject)A),&C);
118: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119: MatSetType(C,((PetscObject)A)->type_name);
120: MatSeqBAIJSetPreallocation(C,bs,0,lens);
121: }
122: c = (Mat_SeqBAIJ*)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140: /* sort */
141: {
142: MatScalar *work;
143: PetscMalloc1(bs2,&work);
144: for (i=0; i<nrows; i++) {
145: PetscInt ilen;
146: mat_i = c->i[i];
147: mat_j = c->j + mat_i;
148: mat_a = c->a + mat_i*bs2;
149: ilen = c->ilen[i];
150: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151: }
152: PetscFree(work);
153: }
155: /* Free work space */
156: ISRestoreIndices(iscol,&icol);
157: PetscFree(smap);
158: PetscFree(lens);
159: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
162: ISRestoreIndices(isrow,&irow);
163: *B = C;
164: return(0);
165: }
167: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168: {
169: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
170: IS is1,is2;
172: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173: const PetscInt *irow,*icol;
176: ISGetIndices(isrow,&irow);
177: ISGetIndices(iscol,&icol);
178: ISGetLocalSize(isrow,&nrows);
179: ISGetLocalSize(iscol,&ncols);
181: /* Verify if the indices corespond to each element in a block
182: and form the IS with compressed IS */
183: maxmnbs = PetscMax(a->mbs,a->nbs);
184: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
185: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
186: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187: for (i=0; i<a->mbs; i++) {
188: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189: }
190: count = 0;
191: for (i=0; i<nrows; i++) {
192: j = irow[i] / bs;
193: if ((vary[j]--)==bs) iary[count++] = j;
194: }
195: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
197: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
198: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199: for (i=0; i<a->nbs; i++) {
200: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201: }
202: count = 0;
203: for (i=0; i<ncols; i++) {
204: j = icol[i] / bs;
205: if ((vary[j]--)==bs) iary[count++] = j;
206: }
207: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
208: ISRestoreIndices(isrow,&irow);
209: ISRestoreIndices(iscol,&icol);
210: PetscFree2(vary,iary);
212: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
213: ISDestroy(&is1);
214: ISDestroy(&is2);
215: return(0);
216: }
218: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219: {
221: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
222: Mat_SubSppt *submatj = c->submatis1;
225: submatj->destroy(C);
226: MatDestroySubMatrix_Private(submatj);
227: return(0);
228: }
230: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231: {
233: PetscInt i;
234: Mat C;
235: Mat_SeqBAIJ *c;
236: Mat_SubSppt *submatj;
239: for (i=0; i<n; i++) {
240: C = (*mat)[i];
241: c = (Mat_SeqBAIJ*)C->data;
242: submatj = c->submatis1;
243: if (submatj) {
244: submatj->destroy(C);
245: MatDestroySubMatrix_Private(submatj);
246: PetscLayoutDestroy(&C->rmap);
247: PetscLayoutDestroy(&C->cmap);
248: PetscHeaderDestroy(&C);
249: } else {
250: MatDestroy(&C);
251: }
252: }
253: PetscFree(*mat);
254: return(0);
255: }
257: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
258: {
260: PetscInt i;
263: if (scall == MAT_INITIAL_MATRIX) {
264: PetscCalloc1(n+1,B);
265: }
267: for (i=0; i<n; i++) {
268: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
269: }
270: return(0);
271: }
273: /* -------------------------------------------------------*/
274: /* Should check that shapes of vectors and matrices match */
275: /* -------------------------------------------------------*/
277: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
278: {
279: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
280: PetscScalar *z,sum;
281: const PetscScalar *x;
282: const MatScalar *v;
283: PetscErrorCode ierr;
284: PetscInt mbs,i,n;
285: const PetscInt *idx,*ii,*ridx=NULL;
286: PetscBool usecprow=a->compressedrow.use;
289: VecGetArrayRead(xx,&x);
290: VecGetArray(zz,&z);
292: if (usecprow) {
293: mbs = a->compressedrow.nrows;
294: ii = a->compressedrow.i;
295: ridx = a->compressedrow.rindex;
296: PetscMemzero(z,a->mbs*sizeof(PetscScalar));
297: } else {
298: mbs = a->mbs;
299: ii = a->i;
300: }
302: for (i=0; i<mbs; i++) {
303: n = ii[1] - ii[0];
304: v = a->a + ii[0];
305: idx = a->j + ii[0];
306: ii++;
307: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
308: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
309: sum = 0.0;
310: PetscSparseDensePlusDot(sum,x,v,idx,n);
311: if (usecprow) {
312: z[ridx[i]] = sum;
313: } else {
314: z[i] = sum;
315: }
316: }
317: VecRestoreArrayRead(xx,&x);
318: VecRestoreArray(zz,&z);
319: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
320: return(0);
321: }
323: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
324: {
325: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
326: PetscScalar *z = 0,sum1,sum2,*zarray;
327: const PetscScalar *x,*xb;
328: PetscScalar x1,x2;
329: const MatScalar *v;
330: PetscErrorCode ierr;
331: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
332: PetscBool usecprow=a->compressedrow.use;
335: VecGetArrayRead(xx,&x);
336: VecGetArray(zz,&zarray);
338: idx = a->j;
339: v = a->a;
340: if (usecprow) {
341: mbs = a->compressedrow.nrows;
342: ii = a->compressedrow.i;
343: ridx = a->compressedrow.rindex;
344: PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
345: } else {
346: mbs = a->mbs;
347: ii = a->i;
348: z = zarray;
349: }
351: for (i=0; i<mbs; i++) {
352: n = ii[1] - ii[0]; ii++;
353: sum1 = 0.0; sum2 = 0.0;
354: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356: for (j=0; j<n; j++) {
357: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
358: sum1 += v[0]*x1 + v[2]*x2;
359: sum2 += v[1]*x1 + v[3]*x2;
360: v += 4;
361: }
362: if (usecprow) z = zarray + 2*ridx[i];
363: z[0] = sum1; z[1] = sum2;
364: if (!usecprow) z += 2;
365: }
366: VecRestoreArrayRead(xx,&x);
367: VecRestoreArray(zz,&zarray);
368: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
369: return(0);
370: }
372: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
373: {
374: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
375: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
376: const PetscScalar *x,*xb;
377: const MatScalar *v;
378: PetscErrorCode ierr;
379: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
380: PetscBool usecprow=a->compressedrow.use;
382: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383: #pragma disjoint(*v,*z,*xb)
384: #endif
387: VecGetArrayRead(xx,&x);
388: VecGetArray(zz,&zarray);
390: idx = a->j;
391: v = a->a;
392: if (usecprow) {
393: mbs = a->compressedrow.nrows;
394: ii = a->compressedrow.i;
395: ridx = a->compressedrow.rindex;
396: PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
397: } else {
398: mbs = a->mbs;
399: ii = a->i;
400: z = zarray;
401: }
403: for (i=0; i<mbs; i++) {
404: n = ii[1] - ii[0]; ii++;
405: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
406: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
407: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408: for (j=0; j<n; j++) {
409: xb = x + 3*(*idx++);
410: x1 = xb[0];
411: x2 = xb[1];
412: x3 = xb[2];
414: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
415: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
416: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
417: v += 9;
418: }
419: if (usecprow) z = zarray + 3*ridx[i];
420: z[0] = sum1; z[1] = sum2; z[2] = sum3;
421: if (!usecprow) z += 3;
422: }
423: VecRestoreArrayRead(xx,&x);
424: VecRestoreArray(zz,&zarray);
425: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
426: return(0);
427: }
429: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
430: {
431: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
432: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
433: const PetscScalar *x,*xb;
434: const MatScalar *v;
435: PetscErrorCode ierr;
436: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
437: PetscBool usecprow=a->compressedrow.use;
440: VecGetArrayRead(xx,&x);
441: VecGetArray(zz,&zarray);
443: idx = a->j;
444: v = a->a;
445: if (usecprow) {
446: mbs = a->compressedrow.nrows;
447: ii = a->compressedrow.i;
448: ridx = a->compressedrow.rindex;
449: PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
450: } else {
451: mbs = a->mbs;
452: ii = a->i;
453: z = zarray;
454: }
456: for (i=0; i<mbs; i++) {
457: n = ii[1] - ii[0];
458: ii++;
459: sum1 = 0.0;
460: sum2 = 0.0;
461: sum3 = 0.0;
462: sum4 = 0.0;
464: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
465: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
466: for (j=0; j<n; j++) {
467: xb = x + 4*(*idx++);
468: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
469: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
470: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
471: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
472: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
473: v += 16;
474: }
475: if (usecprow) z = zarray + 4*ridx[i];
476: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
477: if (!usecprow) z += 4;
478: }
479: VecRestoreArrayRead(xx,&x);
480: VecRestoreArray(zz,&zarray);
481: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
482: return(0);
483: }
485: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
486: {
487: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
488: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
489: const PetscScalar *xb,*x;
490: const MatScalar *v;
491: PetscErrorCode ierr;
492: const PetscInt *idx,*ii,*ridx=NULL;
493: PetscInt mbs,i,j,n;
494: PetscBool usecprow=a->compressedrow.use;
497: VecGetArrayRead(xx,&x);
498: VecGetArray(zz,&zarray);
500: idx = a->j;
501: v = a->a;
502: if (usecprow) {
503: mbs = a->compressedrow.nrows;
504: ii = a->compressedrow.i;
505: ridx = a->compressedrow.rindex;
506: PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
507: } else {
508: mbs = a->mbs;
509: ii = a->i;
510: z = zarray;
511: }
513: for (i=0; i<mbs; i++) {
514: n = ii[1] - ii[0]; ii++;
515: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
516: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
517: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
518: for (j=0; j<n; j++) {
519: xb = x + 5*(*idx++);
520: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
521: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
522: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
523: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
524: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
525: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
526: v += 25;
527: }
528: if (usecprow) z = zarray + 5*ridx[i];
529: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
530: if (!usecprow) z += 5;
531: }
532: VecRestoreArrayRead(xx,&x);
533: VecRestoreArray(zz,&zarray);
534: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
535: return(0);
536: }
540: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541: {
542: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
543: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
544: const PetscScalar *x,*xb;
545: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
546: const MatScalar *v;
547: PetscErrorCode ierr;
548: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
549: PetscBool usecprow=a->compressedrow.use;
552: VecGetArrayRead(xx,&x);
553: VecGetArray(zz,&zarray);
555: idx = a->j;
556: v = a->a;
557: if (usecprow) {
558: mbs = a->compressedrow.nrows;
559: ii = a->compressedrow.i;
560: ridx = a->compressedrow.rindex;
561: PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
562: } else {
563: mbs = a->mbs;
564: ii = a->i;
565: z = zarray;
566: }
568: for (i=0; i<mbs; i++) {
569: n = ii[1] - ii[0];
570: ii++;
571: sum1 = 0.0;
572: sum2 = 0.0;
573: sum3 = 0.0;
574: sum4 = 0.0;
575: sum5 = 0.0;
576: sum6 = 0.0;
578: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580: for (j=0; j<n; j++) {
581: xb = x + 6*(*idx++);
582: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589: v += 36;
590: }
591: if (usecprow) z = zarray + 6*ridx[i];
592: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593: if (!usecprow) z += 6;
594: }
596: VecRestoreArrayRead(xx,&x);
597: VecRestoreArray(zz,&zarray);
598: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
599: return(0);
600: }
602: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603: {
604: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
605: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606: const PetscScalar *x,*xb;
607: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
608: const MatScalar *v;
609: PetscErrorCode ierr;
610: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
611: PetscBool usecprow=a->compressedrow.use;
614: VecGetArrayRead(xx,&x);
615: VecGetArray(zz,&zarray);
617: idx = a->j;
618: v = a->a;
619: if (usecprow) {
620: mbs = a->compressedrow.nrows;
621: ii = a->compressedrow.i;
622: ridx = a->compressedrow.rindex;
623: PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
624: } else {
625: mbs = a->mbs;
626: ii = a->i;
627: z = zarray;
628: }
630: for (i=0; i<mbs; i++) {
631: n = ii[1] - ii[0];
632: ii++;
633: sum1 = 0.0;
634: sum2 = 0.0;
635: sum3 = 0.0;
636: sum4 = 0.0;
637: sum5 = 0.0;
638: sum6 = 0.0;
639: sum7 = 0.0;
641: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
642: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643: for (j=0; j<n; j++) {
644: xb = x + 7*(*idx++);
645: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653: v += 49;
654: }
655: if (usecprow) z = zarray + 7*ridx[i];
656: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657: if (!usecprow) z += 7;
658: }
660: VecRestoreArrayRead(xx,&x);
661: VecRestoreArray(zz,&zarray);
662: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
663: return(0);
664: }
666: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668: {
669: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
670: PetscScalar *z = 0,*work,*workt,*zarray;
671: const PetscScalar *x,*xb;
672: const MatScalar *v;
674: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675: const PetscInt *idx,*ii,*ridx=NULL;
676: PetscInt k;
677: PetscBool usecprow=a->compressedrow.use;
679: __m256d a0,a1,a2,a3,a4,a5;
680: __m256d w0,w1,w2,w3;
681: __m256d z0,z1,z2;
682: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
685: VecGetArrayRead(xx,&x);
686: VecGetArray(zz,&zarray);
688: idx = a->j;
689: v = a->a;
690: if (usecprow) {
691: mbs = a->compressedrow.nrows;
692: ii = a->compressedrow.i;
693: ridx = a->compressedrow.rindex;
694: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
695: } else {
696: mbs = a->mbs;
697: ii = a->i;
698: z = zarray;
699: }
701: if (!a->mult_work) {
702: k = PetscMax(A->rmap->n,A->cmap->n);
703: PetscMalloc1(k+1,&a->mult_work);
704: }
706: work = a->mult_work;
707: for (i=0; i<mbs; i++) {
708: n = ii[1] - ii[0]; ii++;
709: workt = work;
710: for (j=0; j<n; j++) {
711: xb = x + bs*(*idx++);
712: for (k=0; k<bs; k++) workt[k] = xb[k];
713: workt += bs;
714: }
715: if (usecprow) z = zarray + bs*ridx[i];
717: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
719: for (j=0; j<n; j++) {
720: // first column of a
721: w0 = _mm256_set1_pd(work[j*9 ]);
722: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
726: // second column of a
727: w1 = _mm256_set1_pd(work[j*9+ 1]);
728: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
732: // third column of a
733: w2 = _mm256_set1_pd(work[j*9 +2]);
734: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
738: // fourth column of a
739: w3 = _mm256_set1_pd(work[j*9+ 3]);
740: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
744: // fifth column of a
745: w0 = _mm256_set1_pd(work[j*9+ 4]);
746: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
750: // sixth column of a
751: w1 = _mm256_set1_pd(work[j*9+ 5]);
752: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
756: // seventh column of a
757: w2 = _mm256_set1_pd(work[j*9+ 6]);
758: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
762: // eigth column of a
763: w3 = _mm256_set1_pd(work[j*9+ 7]);
764: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
768: // ninth column of a
769: w0 = _mm256_set1_pd(work[j*9+ 8]);
770: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773: }
775: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
777: v += n*bs2;
778: if (!usecprow) z += bs;
779: }
780: VecRestoreArrayRead(xx,&x);
781: VecRestoreArray(zz,&zarray);
782: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
783: return(0);
784: }
785: #endif
787: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788: {
789: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
790: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791: const PetscScalar *x,*xb;
792: PetscScalar *zarray,xv;
793: const MatScalar *v;
794: PetscErrorCode ierr;
795: const PetscInt *ii,*ij=a->j,*idx;
796: PetscInt mbs,i,j,k,n,*ridx=NULL;
797: PetscBool usecprow=a->compressedrow.use;
800: VecGetArrayRead(xx,&x);
801: VecGetArray(zz,&zarray);
803: v = a->a;
804: if (usecprow) {
805: mbs = a->compressedrow.nrows;
806: ii = a->compressedrow.i;
807: ridx = a->compressedrow.rindex;
808: PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));
809: } else {
810: mbs = a->mbs;
811: ii = a->i;
812: z = zarray;
813: }
815: for (i=0; i<mbs; i++) {
816: n = ii[i+1] - ii[i];
817: idx = ij + ii[i];
818: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
821: for (j=0; j<n; j++) {
822: xb = x + 11*(idx[j]);
824: for (k=0; k<11; k++) {
825: xv = xb[k];
826: sum1 += v[0]*xv;
827: sum2 += v[1]*xv;
828: sum3 += v[2]*xv;
829: sum4 += v[3]*xv;
830: sum5 += v[4]*xv;
831: sum6 += v[5]*xv;
832: sum7 += v[6]*xv;
833: sum8 += v[7]*xv;
834: sum9 += v[8]*xv;
835: sum10 += v[9]*xv;
836: sum11 += v[10]*xv;
837: v += 11;
838: }
839: }
840: if (usecprow) z = zarray + 11*ridx[i];
841: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
844: if (!usecprow) z += 11;
845: }
847: VecRestoreArrayRead(xx,&x);
848: VecRestoreArray(zz,&zarray);
849: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
850: return(0);
851: }
853: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854: /* Default MatMult for block size 15 */
856: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
857: {
858: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
859: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
860: const PetscScalar *x,*xb;
861: PetscScalar *zarray,xv;
862: const MatScalar *v;
863: PetscErrorCode ierr;
864: const PetscInt *ii,*ij=a->j,*idx;
865: PetscInt mbs,i,j,k,n,*ridx=NULL;
866: PetscBool usecprow=a->compressedrow.use;
869: VecGetArrayRead(xx,&x);
870: VecGetArray(zz,&zarray);
872: v = a->a;
873: if (usecprow) {
874: mbs = a->compressedrow.nrows;
875: ii = a->compressedrow.i;
876: ridx = a->compressedrow.rindex;
877: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
878: } else {
879: mbs = a->mbs;
880: ii = a->i;
881: z = zarray;
882: }
884: for (i=0; i<mbs; i++) {
885: n = ii[i+1] - ii[i];
886: idx = ij + ii[i];
887: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
888: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
890: for (j=0; j<n; j++) {
891: xb = x + 15*(idx[j]);
893: for (k=0; k<15; k++) {
894: xv = xb[k];
895: sum1 += v[0]*xv;
896: sum2 += v[1]*xv;
897: sum3 += v[2]*xv;
898: sum4 += v[3]*xv;
899: sum5 += v[4]*xv;
900: sum6 += v[5]*xv;
901: sum7 += v[6]*xv;
902: sum8 += v[7]*xv;
903: sum9 += v[8]*xv;
904: sum10 += v[9]*xv;
905: sum11 += v[10]*xv;
906: sum12 += v[11]*xv;
907: sum13 += v[12]*xv;
908: sum14 += v[13]*xv;
909: sum15 += v[14]*xv;
910: v += 15;
911: }
912: }
913: if (usecprow) z = zarray + 15*ridx[i];
914: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
917: if (!usecprow) z += 15;
918: }
920: VecRestoreArrayRead(xx,&x);
921: VecRestoreArray(zz,&zarray);
922: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
923: return(0);
924: }
926: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
927: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
928: {
929: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
930: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931: const PetscScalar *x,*xb;
932: PetscScalar x1,x2,x3,x4,*zarray;
933: const MatScalar *v;
934: PetscErrorCode ierr;
935: const PetscInt *ii,*ij=a->j,*idx;
936: PetscInt mbs,i,j,n,*ridx=NULL;
937: PetscBool usecprow=a->compressedrow.use;
940: VecGetArrayRead(xx,&x);
941: VecGetArray(zz,&zarray);
943: v = a->a;
944: if (usecprow) {
945: mbs = a->compressedrow.nrows;
946: ii = a->compressedrow.i;
947: ridx = a->compressedrow.rindex;
948: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
949: } else {
950: mbs = a->mbs;
951: ii = a->i;
952: z = zarray;
953: }
955: for (i=0; i<mbs; i++) {
956: n = ii[i+1] - ii[i];
957: idx = ij + ii[i];
958: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
961: for (j=0; j<n; j++) {
962: xb = x + 15*(idx[j]);
963: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
965: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
966: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
967: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
968: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
969: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
970: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
971: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
972: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
973: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
974: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
975: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
976: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
977: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
978: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
979: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
981: v += 60;
983: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
985: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
986: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
987: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
988: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
989: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
990: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
991: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
992: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
993: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
994: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
995: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
996: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
997: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
998: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
999: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1000: v += 60;
1002: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1003: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1004: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1005: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1006: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1007: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1008: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1009: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1010: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1011: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1012: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1013: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1014: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1015: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1016: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1017: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1018: v += 60;
1020: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1021: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1022: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1023: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1024: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1025: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1026: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1027: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1028: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1029: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1030: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1031: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1032: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1033: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1034: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1035: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1036: v += 45;
1037: }
1038: if (usecprow) z = zarray + 15*ridx[i];
1039: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1040: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1042: if (!usecprow) z += 15;
1043: }
1045: VecRestoreArrayRead(xx,&x);
1046: VecRestoreArray(zz,&zarray);
1047: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1048: return(0);
1049: }
1051: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1052: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1053: {
1054: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1055: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1056: const PetscScalar *x,*xb;
1057: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1058: const MatScalar *v;
1059: PetscErrorCode ierr;
1060: const PetscInt *ii,*ij=a->j,*idx;
1061: PetscInt mbs,i,j,n,*ridx=NULL;
1062: PetscBool usecprow=a->compressedrow.use;
1065: VecGetArrayRead(xx,&x);
1066: VecGetArray(zz,&zarray);
1068: v = a->a;
1069: if (usecprow) {
1070: mbs = a->compressedrow.nrows;
1071: ii = a->compressedrow.i;
1072: ridx = a->compressedrow.rindex;
1073: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1074: } else {
1075: mbs = a->mbs;
1076: ii = a->i;
1077: z = zarray;
1078: }
1080: for (i=0; i<mbs; i++) {
1081: n = ii[i+1] - ii[i];
1082: idx = ij + ii[i];
1083: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1084: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1086: for (j=0; j<n; j++) {
1087: xb = x + 15*(idx[j]);
1088: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1089: x8 = xb[7];
1091: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1092: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1093: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1094: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1095: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1096: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1097: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1098: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1099: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1100: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1101: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1102: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1103: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1104: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1105: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1106: v += 120;
1108: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1110: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1111: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1112: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1113: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1114: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1115: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1116: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1117: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1118: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1119: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1120: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1121: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1122: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1123: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1124: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1125: v += 105;
1126: }
1127: if (usecprow) z = zarray + 15*ridx[i];
1128: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1129: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1131: if (!usecprow) z += 15;
1132: }
1134: VecRestoreArrayRead(xx,&x);
1135: VecRestoreArray(zz,&zarray);
1136: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1137: return(0);
1138: }
1140: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1142: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1143: {
1144: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1145: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1146: const PetscScalar *x,*xb;
1147: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1148: const MatScalar *v;
1149: PetscErrorCode ierr;
1150: const PetscInt *ii,*ij=a->j,*idx;
1151: PetscInt mbs,i,j,n,*ridx=NULL;
1152: PetscBool usecprow=a->compressedrow.use;
1155: VecGetArrayRead(xx,&x);
1156: VecGetArray(zz,&zarray);
1158: v = a->a;
1159: if (usecprow) {
1160: mbs = a->compressedrow.nrows;
1161: ii = a->compressedrow.i;
1162: ridx = a->compressedrow.rindex;
1163: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1164: } else {
1165: mbs = a->mbs;
1166: ii = a->i;
1167: z = zarray;
1168: }
1170: for (i=0; i<mbs; i++) {
1171: n = ii[i+1] - ii[i];
1172: idx = ij + ii[i];
1173: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1174: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1176: for (j=0; j<n; j++) {
1177: xb = x + 15*(idx[j]);
1178: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1179: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1181: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1182: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1183: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1184: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1185: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1186: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1187: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1188: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1189: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1190: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1191: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1192: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1193: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1194: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1195: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1196: v += 225;
1197: }
1198: if (usecprow) z = zarray + 15*ridx[i];
1199: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1200: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1202: if (!usecprow) z += 15;
1203: }
1205: VecRestoreArrayRead(xx,&x);
1206: VecRestoreArray(zz,&zarray);
1207: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1208: return(0);
1209: }
1212: /*
1213: This will not work with MatScalar == float because it calls the BLAS
1214: */
1215: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1216: {
1217: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1218: PetscScalar *z = 0,*work,*workt,*zarray;
1219: const PetscScalar *x,*xb;
1220: const MatScalar *v;
1221: PetscErrorCode ierr;
1222: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1223: const PetscInt *idx,*ii,*ridx=NULL;
1224: PetscInt ncols,k;
1225: PetscBool usecprow=a->compressedrow.use;
1228: VecGetArrayRead(xx,&x);
1229: VecGetArray(zz,&zarray);
1231: idx = a->j;
1232: v = a->a;
1233: if (usecprow) {
1234: mbs = a->compressedrow.nrows;
1235: ii = a->compressedrow.i;
1236: ridx = a->compressedrow.rindex;
1237: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1238: } else {
1239: mbs = a->mbs;
1240: ii = a->i;
1241: z = zarray;
1242: }
1244: if (!a->mult_work) {
1245: k = PetscMax(A->rmap->n,A->cmap->n);
1246: PetscMalloc1(k+1,&a->mult_work);
1247: }
1248: work = a->mult_work;
1249: for (i=0; i<mbs; i++) {
1250: n = ii[1] - ii[0]; ii++;
1251: ncols = n*bs;
1252: workt = work;
1253: for (j=0; j<n; j++) {
1254: xb = x + bs*(*idx++);
1255: for (k=0; k<bs; k++) workt[k] = xb[k];
1256: workt += bs;
1257: }
1258: if (usecprow) z = zarray + bs*ridx[i];
1259: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1260: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1261: v += n*bs2;
1262: if (!usecprow) z += bs;
1263: }
1264: VecRestoreArrayRead(xx,&x);
1265: VecRestoreArray(zz,&zarray);
1266: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1267: return(0);
1268: }
1270: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1271: {
1272: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1273: const PetscScalar *x;
1274: PetscScalar *y,*z,sum;
1275: const MatScalar *v;
1276: PetscErrorCode ierr;
1277: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1278: const PetscInt *idx,*ii;
1279: PetscBool usecprow=a->compressedrow.use;
1282: VecGetArrayRead(xx,&x);
1283: VecGetArrayPair(yy,zz,&y,&z);
1285: idx = a->j;
1286: v = a->a;
1287: if (usecprow) {
1288: if (zz != yy) {
1289: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1290: }
1291: mbs = a->compressedrow.nrows;
1292: ii = a->compressedrow.i;
1293: ridx = a->compressedrow.rindex;
1294: } else {
1295: ii = a->i;
1296: }
1298: for (i=0; i<mbs; i++) {
1299: n = ii[1] - ii[0];
1300: ii++;
1301: if (!usecprow) {
1302: sum = y[i];
1303: } else {
1304: sum = y[ridx[i]];
1305: }
1306: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1307: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1308: PetscSparseDensePlusDot(sum,x,v,idx,n);
1309: v += n;
1310: idx += n;
1311: if (usecprow) {
1312: z[ridx[i]] = sum;
1313: } else {
1314: z[i] = sum;
1315: }
1316: }
1317: VecRestoreArrayRead(xx,&x);
1318: VecRestoreArrayPair(yy,zz,&y,&z);
1319: PetscLogFlops(2.0*a->nz);
1320: return(0);
1321: }
1323: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1324: {
1325: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1326: PetscScalar *y = 0,*z = 0,sum1,sum2;
1327: const PetscScalar *x,*xb;
1328: PetscScalar x1,x2,*yarray,*zarray;
1329: const MatScalar *v;
1330: PetscErrorCode ierr;
1331: PetscInt mbs = a->mbs,i,n,j;
1332: const PetscInt *idx,*ii,*ridx = NULL;
1333: PetscBool usecprow = a->compressedrow.use;
1336: VecGetArrayRead(xx,&x);
1337: VecGetArrayPair(yy,zz,&yarray,&zarray);
1339: idx = a->j;
1340: v = a->a;
1341: if (usecprow) {
1342: if (zz != yy) {
1343: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1344: }
1345: mbs = a->compressedrow.nrows;
1346: ii = a->compressedrow.i;
1347: ridx = a->compressedrow.rindex;
1348: } else {
1349: ii = a->i;
1350: y = yarray;
1351: z = zarray;
1352: }
1354: for (i=0; i<mbs; i++) {
1355: n = ii[1] - ii[0]; ii++;
1356: if (usecprow) {
1357: z = zarray + 2*ridx[i];
1358: y = yarray + 2*ridx[i];
1359: }
1360: sum1 = y[0]; sum2 = y[1];
1361: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1362: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363: for (j=0; j<n; j++) {
1364: xb = x + 2*(*idx++);
1365: x1 = xb[0];
1366: x2 = xb[1];
1368: sum1 += v[0]*x1 + v[2]*x2;
1369: sum2 += v[1]*x1 + v[3]*x2;
1370: v += 4;
1371: }
1372: z[0] = sum1; z[1] = sum2;
1373: if (!usecprow) {
1374: z += 2; y += 2;
1375: }
1376: }
1377: VecRestoreArrayRead(xx,&x);
1378: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1379: PetscLogFlops(4.0*a->nz);
1380: return(0);
1381: }
1383: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1384: {
1385: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1386: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1387: const PetscScalar *x,*xb;
1388: const MatScalar *v;
1389: PetscErrorCode ierr;
1390: PetscInt mbs = a->mbs,i,j,n;
1391: const PetscInt *idx,*ii,*ridx = NULL;
1392: PetscBool usecprow = a->compressedrow.use;
1395: VecGetArrayRead(xx,&x);
1396: VecGetArrayPair(yy,zz,&yarray,&zarray);
1398: idx = a->j;
1399: v = a->a;
1400: if (usecprow) {
1401: if (zz != yy) {
1402: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1403: }
1404: mbs = a->compressedrow.nrows;
1405: ii = a->compressedrow.i;
1406: ridx = a->compressedrow.rindex;
1407: } else {
1408: ii = a->i;
1409: y = yarray;
1410: z = zarray;
1411: }
1413: for (i=0; i<mbs; i++) {
1414: n = ii[1] - ii[0]; ii++;
1415: if (usecprow) {
1416: z = zarray + 3*ridx[i];
1417: y = yarray + 3*ridx[i];
1418: }
1419: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1420: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1421: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1422: for (j=0; j<n; j++) {
1423: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1424: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1425: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1426: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1427: v += 9;
1428: }
1429: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1430: if (!usecprow) {
1431: z += 3; y += 3;
1432: }
1433: }
1434: VecRestoreArrayRead(xx,&x);
1435: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1436: PetscLogFlops(18.0*a->nz);
1437: return(0);
1438: }
1440: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1441: {
1442: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1443: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1444: const PetscScalar *x,*xb;
1445: const MatScalar *v;
1446: PetscErrorCode ierr;
1447: PetscInt mbs = a->mbs,i,j,n;
1448: const PetscInt *idx,*ii,*ridx=NULL;
1449: PetscBool usecprow=a->compressedrow.use;
1452: VecGetArrayRead(xx,&x);
1453: VecGetArrayPair(yy,zz,&yarray,&zarray);
1455: idx = a->j;
1456: v = a->a;
1457: if (usecprow) {
1458: if (zz != yy) {
1459: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1460: }
1461: mbs = a->compressedrow.nrows;
1462: ii = a->compressedrow.i;
1463: ridx = a->compressedrow.rindex;
1464: } else {
1465: ii = a->i;
1466: y = yarray;
1467: z = zarray;
1468: }
1470: for (i=0; i<mbs; i++) {
1471: n = ii[1] - ii[0]; ii++;
1472: if (usecprow) {
1473: z = zarray + 4*ridx[i];
1474: y = yarray + 4*ridx[i];
1475: }
1476: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1477: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1478: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1479: for (j=0; j<n; j++) {
1480: xb = x + 4*(*idx++);
1481: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1482: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1483: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1484: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1485: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1486: v += 16;
1487: }
1488: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1489: if (!usecprow) {
1490: z += 4; y += 4;
1491: }
1492: }
1493: VecRestoreArrayRead(xx,&x);
1494: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1495: PetscLogFlops(32.0*a->nz);
1496: return(0);
1497: }
1499: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1500: {
1501: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1502: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1503: const PetscScalar *x,*xb;
1504: PetscScalar *yarray,*zarray;
1505: const MatScalar *v;
1506: PetscErrorCode ierr;
1507: PetscInt mbs = a->mbs,i,j,n;
1508: const PetscInt *idx,*ii,*ridx = NULL;
1509: PetscBool usecprow=a->compressedrow.use;
1512: VecGetArrayRead(xx,&x);
1513: VecGetArrayPair(yy,zz,&yarray,&zarray);
1515: idx = a->j;
1516: v = a->a;
1517: if (usecprow) {
1518: if (zz != yy) {
1519: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1520: }
1521: mbs = a->compressedrow.nrows;
1522: ii = a->compressedrow.i;
1523: ridx = a->compressedrow.rindex;
1524: } else {
1525: ii = a->i;
1526: y = yarray;
1527: z = zarray;
1528: }
1530: for (i=0; i<mbs; i++) {
1531: n = ii[1] - ii[0]; ii++;
1532: if (usecprow) {
1533: z = zarray + 5*ridx[i];
1534: y = yarray + 5*ridx[i];
1535: }
1536: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1537: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1538: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1539: for (j=0; j<n; j++) {
1540: xb = x + 5*(*idx++);
1541: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1542: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1543: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1544: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1545: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1546: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1547: v += 25;
1548: }
1549: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1550: if (!usecprow) {
1551: z += 5; y += 5;
1552: }
1553: }
1554: VecRestoreArrayRead(xx,&x);
1555: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1556: PetscLogFlops(50.0*a->nz);
1557: return(0);
1558: }
1559: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1560: {
1561: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1562: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1563: const PetscScalar *x,*xb;
1564: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1565: const MatScalar *v;
1566: PetscErrorCode ierr;
1567: PetscInt mbs = a->mbs,i,j,n;
1568: const PetscInt *idx,*ii,*ridx=NULL;
1569: PetscBool usecprow=a->compressedrow.use;
1572: VecGetArrayRead(xx,&x);
1573: VecGetArrayPair(yy,zz,&yarray,&zarray);
1575: idx = a->j;
1576: v = a->a;
1577: if (usecprow) {
1578: if (zz != yy) {
1579: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1580: }
1581: mbs = a->compressedrow.nrows;
1582: ii = a->compressedrow.i;
1583: ridx = a->compressedrow.rindex;
1584: } else {
1585: ii = a->i;
1586: y = yarray;
1587: z = zarray;
1588: }
1590: for (i=0; i<mbs; i++) {
1591: n = ii[1] - ii[0]; ii++;
1592: if (usecprow) {
1593: z = zarray + 6*ridx[i];
1594: y = yarray + 6*ridx[i];
1595: }
1596: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1597: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1598: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1599: for (j=0; j<n; j++) {
1600: xb = x + 6*(*idx++);
1601: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1602: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1603: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1604: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1605: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1606: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1607: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1608: v += 36;
1609: }
1610: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1611: if (!usecprow) {
1612: z += 6; y += 6;
1613: }
1614: }
1615: VecRestoreArrayRead(xx,&x);
1616: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1617: PetscLogFlops(72.0*a->nz);
1618: return(0);
1619: }
1621: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1622: {
1623: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1624: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1625: const PetscScalar *x,*xb;
1626: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1627: const MatScalar *v;
1628: PetscErrorCode ierr;
1629: PetscInt mbs = a->mbs,i,j,n;
1630: const PetscInt *idx,*ii,*ridx = NULL;
1631: PetscBool usecprow=a->compressedrow.use;
1634: VecGetArrayRead(xx,&x);
1635: VecGetArrayPair(yy,zz,&yarray,&zarray);
1637: idx = a->j;
1638: v = a->a;
1639: if (usecprow) {
1640: if (zz != yy) {
1641: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1642: }
1643: mbs = a->compressedrow.nrows;
1644: ii = a->compressedrow.i;
1645: ridx = a->compressedrow.rindex;
1646: } else {
1647: ii = a->i;
1648: y = yarray;
1649: z = zarray;
1650: }
1652: for (i=0; i<mbs; i++) {
1653: n = ii[1] - ii[0]; ii++;
1654: if (usecprow) {
1655: z = zarray + 7*ridx[i];
1656: y = yarray + 7*ridx[i];
1657: }
1658: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1659: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1660: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1661: for (j=0; j<n; j++) {
1662: xb = x + 7*(*idx++);
1663: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1664: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1665: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1666: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1667: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1668: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1669: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1670: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1671: v += 49;
1672: }
1673: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1674: if (!usecprow) {
1675: z += 7; y += 7;
1676: }
1677: }
1678: VecRestoreArrayRead(xx,&x);
1679: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1680: PetscLogFlops(98.0*a->nz);
1681: return(0);
1682: }
1684: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1685: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1686: {
1687: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1688: PetscScalar *z = 0,*work,*workt,*zarray;
1689: const PetscScalar *x,*xb;
1690: const MatScalar *v;
1692: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1693: PetscInt k;
1694: PetscBool usecprow=a->compressedrow.use;
1695: const PetscInt *idx,*ii,*ridx=NULL;
1697: __m256d a0,a1,a2,a3,a4,a5;
1698: __m256d w0,w1,w2,w3;
1699: __m256d z0,z1,z2;
1700: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1703: VecCopy(yy,zz);
1704: VecGetArrayRead(xx,&x);
1705: VecGetArray(zz,&zarray);
1707: idx = a->j;
1708: v = a->a;
1709: if (usecprow) {
1710: mbs = a->compressedrow.nrows;
1711: ii = a->compressedrow.i;
1712: ridx = a->compressedrow.rindex;
1713: } else {
1714: mbs = a->mbs;
1715: ii = a->i;
1716: z = zarray;
1717: }
1719: if (!a->mult_work) {
1720: k = PetscMax(A->rmap->n,A->cmap->n);
1721: PetscMalloc1(k+1,&a->mult_work);
1722: }
1724: work = a->mult_work;
1725: for (i=0; i<mbs; i++) {
1726: n = ii[1] - ii[0]; ii++;
1727: workt = work;
1728: for (j=0; j<n; j++) {
1729: xb = x + bs*(*idx++);
1730: for (k=0; k<bs; k++) workt[k] = xb[k];
1731: workt += bs;
1732: }
1733: if (usecprow) z = zarray + bs*ridx[i];
1735: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1737: for (j=0; j<n; j++) {
1738: // first column of a
1739: w0 = _mm256_set1_pd(work[j*9 ]);
1740: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1741: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1742: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1744: // second column of a
1745: w1 = _mm256_set1_pd(work[j*9+ 1]);
1746: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1747: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1748: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1750: // third column of a
1751: w2 = _mm256_set1_pd(work[j*9+ 2]);
1752: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1753: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1754: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1756: // fourth column of a
1757: w3 = _mm256_set1_pd(work[j*9+ 3]);
1758: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1759: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1760: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1762: // fifth column of a
1763: w0 = _mm256_set1_pd(work[j*9+ 4]);
1764: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1765: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1766: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1768: // sixth column of a
1769: w1 = _mm256_set1_pd(work[j*9+ 5]);
1770: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1771: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1772: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1774: // seventh column of a
1775: w2 = _mm256_set1_pd(work[j*9+ 6]);
1776: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1777: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1778: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1780: // eigth column of a
1781: w3 = _mm256_set1_pd(work[j*9+ 7]);
1782: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1783: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1784: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1786: // ninth column of a
1787: w0 = _mm256_set1_pd(work[j*9+ 8]);
1788: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1789: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1790: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1791: }
1793: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1795: v += n*bs2;
1796: if (!usecprow) z += bs;
1797: }
1798: VecRestoreArrayRead(xx,&x);
1799: VecRestoreArray(zz,&zarray);
1800: PetscLogFlops(162.0*a->nz);
1801: return(0);
1802: }
1803: #endif
1805: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1806: {
1807: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1808: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1809: const PetscScalar *x,*xb;
1810: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1811: const MatScalar *v;
1812: PetscErrorCode ierr;
1813: PetscInt mbs = a->mbs,i,j,n;
1814: const PetscInt *idx,*ii,*ridx = NULL;
1815: PetscBool usecprow=a->compressedrow.use;
1818: VecGetArrayRead(xx,&x);
1819: VecGetArrayPair(yy,zz,&yarray,&zarray);
1821: idx = a->j;
1822: v = a->a;
1823: if (usecprow) {
1824: if (zz != yy) {
1825: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1826: }
1827: mbs = a->compressedrow.nrows;
1828: ii = a->compressedrow.i;
1829: ridx = a->compressedrow.rindex;
1830: } else {
1831: ii = a->i;
1832: y = yarray;
1833: z = zarray;
1834: }
1836: for (i=0; i<mbs; i++) {
1837: n = ii[1] - ii[0]; ii++;
1838: if (usecprow) {
1839: z = zarray + 11*ridx[i];
1840: y = yarray + 11*ridx[i];
1841: }
1842: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1843: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1844: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1845: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1846: for (j=0; j<n; j++) {
1847: xb = x + 11*(*idx++);
1848: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1849: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1850: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1851: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1852: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1853: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1854: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1855: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1856: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1857: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1858: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1859: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1860: v += 121;
1861: }
1862: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1863: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1864: if (!usecprow) {
1865: z += 11; y += 11;
1866: }
1867: }
1868: VecRestoreArrayRead(xx,&x);
1869: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1870: PetscLogFlops(242.0*a->nz);
1871: return(0);
1872: }
1874: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1875: {
1876: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1877: PetscScalar *z = 0,*work,*workt,*zarray;
1878: const PetscScalar *x,*xb;
1879: const MatScalar *v;
1880: PetscErrorCode ierr;
1881: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1882: PetscInt ncols,k;
1883: const PetscInt *ridx = NULL,*idx,*ii;
1884: PetscBool usecprow = a->compressedrow.use;
1887: VecCopy(yy,zz);
1888: VecGetArrayRead(xx,&x);
1889: VecGetArray(zz,&zarray);
1891: idx = a->j;
1892: v = a->a;
1893: if (usecprow) {
1894: mbs = a->compressedrow.nrows;
1895: ii = a->compressedrow.i;
1896: ridx = a->compressedrow.rindex;
1897: } else {
1898: mbs = a->mbs;
1899: ii = a->i;
1900: z = zarray;
1901: }
1903: if (!a->mult_work) {
1904: k = PetscMax(A->rmap->n,A->cmap->n);
1905: PetscMalloc1(k+1,&a->mult_work);
1906: }
1907: work = a->mult_work;
1908: for (i=0; i<mbs; i++) {
1909: n = ii[1] - ii[0]; ii++;
1910: ncols = n*bs;
1911: workt = work;
1912: for (j=0; j<n; j++) {
1913: xb = x + bs*(*idx++);
1914: for (k=0; k<bs; k++) workt[k] = xb[k];
1915: workt += bs;
1916: }
1917: if (usecprow) z = zarray + bs*ridx[i];
1918: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1919: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1920: v += n*bs2;
1921: if (!usecprow) z += bs;
1922: }
1923: VecRestoreArrayRead(xx,&x);
1924: VecRestoreArray(zz,&zarray);
1925: PetscLogFlops(2.0*a->nz*bs2);
1926: return(0);
1927: }
1929: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1930: {
1931: PetscScalar zero = 0.0;
1935: VecSet(zz,zero);
1936: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1937: return(0);
1938: }
1940: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1941: {
1942: PetscScalar zero = 0.0;
1946: VecSet(zz,zero);
1947: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1948: return(0);
1949: }
1951: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1952: {
1953: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1954: PetscScalar *z,x1,x2,x3,x4,x5;
1955: const PetscScalar *x,*xb = NULL;
1956: const MatScalar *v;
1957: PetscErrorCode ierr;
1958: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1959: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1960: Mat_CompressedRow cprow = a->compressedrow;
1961: PetscBool usecprow = cprow.use;
1964: if (yy != zz) { VecCopy(yy,zz); }
1965: VecGetArrayRead(xx,&x);
1966: VecGetArray(zz,&z);
1968: idx = a->j;
1969: v = a->a;
1970: if (usecprow) {
1971: mbs = cprow.nrows;
1972: ii = cprow.i;
1973: ridx = cprow.rindex;
1974: } else {
1975: mbs=a->mbs;
1976: ii = a->i;
1977: xb = x;
1978: }
1980: switch (bs) {
1981: case 1:
1982: for (i=0; i<mbs; i++) {
1983: if (usecprow) xb = x + ridx[i];
1984: x1 = xb[0];
1985: ib = idx + ii[0];
1986: n = ii[1] - ii[0]; ii++;
1987: for (j=0; j<n; j++) {
1988: rval = ib[j];
1989: z[rval] += PetscConj(*v) * x1;
1990: v++;
1991: }
1992: if (!usecprow) xb++;
1993: }
1994: break;
1995: case 2:
1996: for (i=0; i<mbs; i++) {
1997: if (usecprow) xb = x + 2*ridx[i];
1998: x1 = xb[0]; x2 = xb[1];
1999: ib = idx + ii[0];
2000: n = ii[1] - ii[0]; ii++;
2001: for (j=0; j<n; j++) {
2002: rval = ib[j]*2;
2003: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2004: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2005: v += 4;
2006: }
2007: if (!usecprow) xb += 2;
2008: }
2009: break;
2010: case 3:
2011: for (i=0; i<mbs; i++) {
2012: if (usecprow) xb = x + 3*ridx[i];
2013: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2014: ib = idx + ii[0];
2015: n = ii[1] - ii[0]; ii++;
2016: for (j=0; j<n; j++) {
2017: rval = ib[j]*3;
2018: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2019: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2020: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2021: v += 9;
2022: }
2023: if (!usecprow) xb += 3;
2024: }
2025: break;
2026: case 4:
2027: for (i=0; i<mbs; i++) {
2028: if (usecprow) xb = x + 4*ridx[i];
2029: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2030: ib = idx + ii[0];
2031: n = ii[1] - ii[0]; ii++;
2032: for (j=0; j<n; j++) {
2033: rval = ib[j]*4;
2034: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2035: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2036: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2037: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2038: v += 16;
2039: }
2040: if (!usecprow) xb += 4;
2041: }
2042: break;
2043: case 5:
2044: for (i=0; i<mbs; i++) {
2045: if (usecprow) xb = x + 5*ridx[i];
2046: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2047: x4 = xb[3]; x5 = xb[4];
2048: ib = idx + ii[0];
2049: n = ii[1] - ii[0]; ii++;
2050: for (j=0; j<n; j++) {
2051: rval = ib[j]*5;
2052: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2053: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2054: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2055: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2056: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2057: v += 25;
2058: }
2059: if (!usecprow) xb += 5;
2060: }
2061: break;
2062: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2063: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2064: #if 0
2065: {
2066: PetscInt ncols,k,bs2=a->bs2;
2067: PetscScalar *work,*workt,zb;
2068: const PetscScalar *xtmp;
2069: if (!a->mult_work) {
2070: k = PetscMax(A->rmap->n,A->cmap->n);
2071: PetscMalloc1(k+1,&a->mult_work);
2072: }
2073: work = a->mult_work;
2074: xtmp = x;
2075: for (i=0; i<mbs; i++) {
2076: n = ii[1] - ii[0]; ii++;
2077: ncols = n*bs;
2078: PetscMemzero(work,ncols*sizeof(PetscScalar));
2079: if (usecprow) xtmp = x + bs*ridx[i];
2080: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2081: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2082: v += n*bs2;
2083: if (!usecprow) xtmp += bs;
2084: workt = work;
2085: for (j=0; j<n; j++) {
2086: zb = z + bs*(*idx++);
2087: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2088: workt += bs;
2089: }
2090: }
2091: }
2092: #endif
2093: }
2094: VecRestoreArrayRead(xx,&x);
2095: VecRestoreArray(zz,&z);
2096: PetscLogFlops(2.0*a->nz*a->bs2);
2097: return(0);
2098: }
2100: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2101: {
2102: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2103: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2104: const PetscScalar *x,*xb = 0;
2105: const MatScalar *v;
2106: PetscErrorCode ierr;
2107: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2108: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2109: Mat_CompressedRow cprow = a->compressedrow;
2110: PetscBool usecprow=cprow.use;
2113: if (yy != zz) { VecCopy(yy,zz); }
2114: VecGetArrayRead(xx,&x);
2115: VecGetArray(zz,&z);
2117: idx = a->j;
2118: v = a->a;
2119: if (usecprow) {
2120: mbs = cprow.nrows;
2121: ii = cprow.i;
2122: ridx = cprow.rindex;
2123: } else {
2124: mbs=a->mbs;
2125: ii = a->i;
2126: xb = x;
2127: }
2129: switch (bs) {
2130: case 1:
2131: for (i=0; i<mbs; i++) {
2132: if (usecprow) xb = x + ridx[i];
2133: x1 = xb[0];
2134: ib = idx + ii[0];
2135: n = ii[1] - ii[0]; ii++;
2136: for (j=0; j<n; j++) {
2137: rval = ib[j];
2138: z[rval] += *v * x1;
2139: v++;
2140: }
2141: if (!usecprow) xb++;
2142: }
2143: break;
2144: case 2:
2145: for (i=0; i<mbs; i++) {
2146: if (usecprow) xb = x + 2*ridx[i];
2147: x1 = xb[0]; x2 = xb[1];
2148: ib = idx + ii[0];
2149: n = ii[1] - ii[0]; ii++;
2150: for (j=0; j<n; j++) {
2151: rval = ib[j]*2;
2152: z[rval++] += v[0]*x1 + v[1]*x2;
2153: z[rval++] += v[2]*x1 + v[3]*x2;
2154: v += 4;
2155: }
2156: if (!usecprow) xb += 2;
2157: }
2158: break;
2159: case 3:
2160: for (i=0; i<mbs; i++) {
2161: if (usecprow) xb = x + 3*ridx[i];
2162: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2163: ib = idx + ii[0];
2164: n = ii[1] - ii[0]; ii++;
2165: for (j=0; j<n; j++) {
2166: rval = ib[j]*3;
2167: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2168: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2169: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2170: v += 9;
2171: }
2172: if (!usecprow) xb += 3;
2173: }
2174: break;
2175: case 4:
2176: for (i=0; i<mbs; i++) {
2177: if (usecprow) xb = x + 4*ridx[i];
2178: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2179: ib = idx + ii[0];
2180: n = ii[1] - ii[0]; ii++;
2181: for (j=0; j<n; j++) {
2182: rval = ib[j]*4;
2183: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2184: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2185: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2186: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2187: v += 16;
2188: }
2189: if (!usecprow) xb += 4;
2190: }
2191: break;
2192: case 5:
2193: for (i=0; i<mbs; i++) {
2194: if (usecprow) xb = x + 5*ridx[i];
2195: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2196: x4 = xb[3]; x5 = xb[4];
2197: ib = idx + ii[0];
2198: n = ii[1] - ii[0]; ii++;
2199: for (j=0; j<n; j++) {
2200: rval = ib[j]*5;
2201: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2202: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2203: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2204: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2205: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2206: v += 25;
2207: }
2208: if (!usecprow) xb += 5;
2209: }
2210: break;
2211: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2212: PetscInt ncols,k;
2213: PetscScalar *work,*workt;
2214: const PetscScalar *xtmp;
2215: if (!a->mult_work) {
2216: k = PetscMax(A->rmap->n,A->cmap->n);
2217: PetscMalloc1(k+1,&a->mult_work);
2218: }
2219: work = a->mult_work;
2220: xtmp = x;
2221: for (i=0; i<mbs; i++) {
2222: n = ii[1] - ii[0]; ii++;
2223: ncols = n*bs;
2224: PetscMemzero(work,ncols*sizeof(PetscScalar));
2225: if (usecprow) xtmp = x + bs*ridx[i];
2226: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2227: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2228: v += n*bs2;
2229: if (!usecprow) xtmp += bs;
2230: workt = work;
2231: for (j=0; j<n; j++) {
2232: zb = z + bs*(*idx++);
2233: for (k=0; k<bs; k++) zb[k] += workt[k];
2234: workt += bs;
2235: }
2236: }
2237: }
2238: }
2239: VecRestoreArrayRead(xx,&x);
2240: VecRestoreArray(zz,&z);
2241: PetscLogFlops(2.0*a->nz*a->bs2);
2242: return(0);
2243: }
2245: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2246: {
2247: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2248: PetscInt totalnz = a->bs2*a->nz;
2249: PetscScalar oalpha = alpha;
2251: PetscBLASInt one = 1,tnz;
2254: PetscBLASIntCast(totalnz,&tnz);
2255: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2256: PetscLogFlops(totalnz);
2257: return(0);
2258: }
2260: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2261: {
2263: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2264: MatScalar *v = a->a;
2265: PetscReal sum = 0.0;
2266: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2269: if (type == NORM_FROBENIUS) {
2270: #if defined(PETSC_USE_REAL___FP16)
2271: PetscBLASInt one = 1,cnt = bs2*nz;
2272: *norm = BLASnrm2_(&cnt,v,&one);
2273: #else
2274: for (i=0; i<bs2*nz; i++) {
2275: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2276: }
2277: #endif
2278: *norm = PetscSqrtReal(sum);
2279: PetscLogFlops(2*bs2*nz);
2280: } else if (type == NORM_1) { /* maximum column sum */
2281: PetscReal *tmp;
2282: PetscInt *bcol = a->j;
2283: PetscCalloc1(A->cmap->n+1,&tmp);
2284: for (i=0; i<nz; i++) {
2285: for (j=0; j<bs; j++) {
2286: k1 = bs*(*bcol) + j; /* column index */
2287: for (k=0; k<bs; k++) {
2288: tmp[k1] += PetscAbsScalar(*v); v++;
2289: }
2290: }
2291: bcol++;
2292: }
2293: *norm = 0.0;
2294: for (j=0; j<A->cmap->n; j++) {
2295: if (tmp[j] > *norm) *norm = tmp[j];
2296: }
2297: PetscFree(tmp);
2298: PetscLogFlops(PetscMax(bs2*nz-1,0));
2299: } else if (type == NORM_INFINITY) { /* maximum row sum */
2300: *norm = 0.0;
2301: for (k=0; k<bs; k++) {
2302: for (j=0; j<a->mbs; j++) {
2303: v = a->a + bs2*a->i[j] + k;
2304: sum = 0.0;
2305: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2306: for (k1=0; k1<bs; k1++) {
2307: sum += PetscAbsScalar(*v);
2308: v += bs;
2309: }
2310: }
2311: if (sum > *norm) *norm = sum;
2312: }
2313: }
2314: PetscLogFlops(PetscMax(bs2*nz-1,0));
2315: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2316: return(0);
2317: }
2320: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2321: {
2322: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2326: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2327: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2328: *flg = PETSC_FALSE;
2329: return(0);
2330: }
2332: /* if the a->i are the same */
2333: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2334: if (!*flg) return(0);
2336: /* if a->j are the same */
2337: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2338: if (!*flg) return(0);
2340: /* if a->a are the same */
2341: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2342: return(0);
2344: }
2346: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2347: {
2348: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2350: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2351: PetscScalar *x,zero = 0.0;
2352: MatScalar *aa,*aa_j;
2355: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2356: bs = A->rmap->bs;
2357: aa = a->a;
2358: ai = a->i;
2359: aj = a->j;
2360: ambs = a->mbs;
2361: bs2 = a->bs2;
2363: VecSet(v,zero);
2364: VecGetArray(v,&x);
2365: VecGetLocalSize(v,&n);
2366: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2367: for (i=0; i<ambs; i++) {
2368: for (j=ai[i]; j<ai[i+1]; j++) {
2369: if (aj[j] == i) {
2370: row = i*bs;
2371: aa_j = aa+j*bs2;
2372: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2373: break;
2374: }
2375: }
2376: }
2377: VecRestoreArray(v,&x);
2378: return(0);
2379: }
2381: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2382: {
2383: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2384: const PetscScalar *l,*r,*li,*ri;
2385: PetscScalar x;
2386: MatScalar *aa, *v;
2387: PetscErrorCode ierr;
2388: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2389: const PetscInt *ai,*aj;
2392: ai = a->i;
2393: aj = a->j;
2394: aa = a->a;
2395: m = A->rmap->n;
2396: n = A->cmap->n;
2397: bs = A->rmap->bs;
2398: mbs = a->mbs;
2399: bs2 = a->bs2;
2400: if (ll) {
2401: VecGetArrayRead(ll,&l);
2402: VecGetLocalSize(ll,&lm);
2403: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2404: for (i=0; i<mbs; i++) { /* for each block row */
2405: M = ai[i+1] - ai[i];
2406: li = l + i*bs;
2407: v = aa + bs2*ai[i];
2408: for (j=0; j<M; j++) { /* for each block */
2409: for (k=0; k<bs2; k++) {
2410: (*v++) *= li[k%bs];
2411: }
2412: }
2413: }
2414: VecRestoreArrayRead(ll,&l);
2415: PetscLogFlops(a->nz);
2416: }
2418: if (rr) {
2419: VecGetArrayRead(rr,&r);
2420: VecGetLocalSize(rr,&rn);
2421: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2422: for (i=0; i<mbs; i++) { /* for each block row */
2423: iai = ai[i];
2424: M = ai[i+1] - iai;
2425: v = aa + bs2*iai;
2426: for (j=0; j<M; j++) { /* for each block */
2427: ri = r + bs*aj[iai+j];
2428: for (k=0; k<bs; k++) {
2429: x = ri[k];
2430: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2431: v += bs;
2432: }
2433: }
2434: }
2435: VecRestoreArrayRead(rr,&r);
2436: PetscLogFlops(a->nz);
2437: }
2438: return(0);
2439: }
2442: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2443: {
2444: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2447: info->block_size = a->bs2;
2448: info->nz_allocated = a->bs2*a->maxnz;
2449: info->nz_used = a->bs2*a->nz;
2450: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2451: info->assemblies = A->num_ass;
2452: info->mallocs = A->info.mallocs;
2453: info->memory = ((PetscObject)A)->mem;
2454: if (A->factortype) {
2455: info->fill_ratio_given = A->info.fill_ratio_given;
2456: info->fill_ratio_needed = A->info.fill_ratio_needed;
2457: info->factor_mallocs = A->info.factor_mallocs;
2458: } else {
2459: info->fill_ratio_given = 0;
2460: info->fill_ratio_needed = 0;
2461: info->factor_mallocs = 0;
2462: }
2463: return(0);
2464: }
2466: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2467: {
2468: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2472: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2473: return(0);
2474: }