Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #elif defined(PETSC_HAVE_XMMINTRIN_H)
10: #include <xmmintrin.h>
11: #endif
13: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
14: {
15: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
16: PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
17: const PetscInt *idx;
18: PetscInt start, end, *ai, *aj, bs;
19: PetscBT table;
21: PetscFunctionBegin;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
29: PetscCall(PetscBTCreate(m, &table));
30: PetscCall(PetscMalloc1(m + 1, &nidx));
32: for (i = 0; i < is_max; i++) {
33: /* Initialise the two local arrays */
34: isz = 0;
35: PetscCall(PetscBTMemzero(m, table));
37: /* Extract the indices, assume there can be duplicate entries */
38: PetscCall(ISGetIndices(is[i], &idx));
39: PetscCall(ISGetLocalSize(is[i], &n));
41: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42: for (j = 0; j < n; ++j) {
43: ival = idx[j] / bs; /* convert the indices into block indices */
44: PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
45: if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
46: }
47: PetscCall(ISRestoreIndices(is[i], &idx));
48: PetscCall(ISDestroy(&is[i]));
50: k = 0;
51: for (j = 0; j < ov; j++) { /* for each overlap*/
52: n = isz;
53: for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
54: row = nidx[k];
55: start = ai[row];
56: end = ai[row + 1];
57: for (l = start; l < end; l++) {
58: val = aj[l];
59: if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
60: }
61: }
62: }
63: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
64: }
65: PetscCall(PetscBTDestroy(&table));
66: PetscCall(PetscFree(nidx));
67: PetscFunctionReturn(PETSC_SUCCESS);
68: }
70: static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
71: {
72: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
73: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
74: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
75: const PetscInt *irow, *icol;
76: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
77: PetscInt *aj = a->j, *ai = a->i;
78: MatScalar *mat_a;
79: Mat C;
80: PetscBool flag;
82: PetscFunctionBegin;
83: PetscCall(ISGetIndices(isrow, &irow));
84: PetscCall(ISGetIndices(iscol, &icol));
85: PetscCall(ISGetLocalSize(isrow, &nrows));
86: PetscCall(ISGetLocalSize(iscol, &ncols));
88: PetscCall(PetscCalloc1(1 + oldcols, &smap));
89: ssmap = smap;
90: PetscCall(PetscMalloc1(1 + nrows, &lens));
91: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
92: /* determine lens of each row */
93: for (i = 0; i < nrows; i++) {
94: kstart = ai[irow[i]];
95: kend = kstart + a->ilen[irow[i]];
96: lens[i] = 0;
97: for (k = kstart; k < kend; k++) {
98: if (ssmap[aj[k]]) lens[i]++;
99: }
100: }
101: /* Create and fill new matrix */
102: if (scall == MAT_REUSE_MATRIX) {
103: c = (Mat_SeqBAIJ *)((*B)->data);
105: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
106: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
107: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
108: PetscCall(PetscArrayzero(c->ilen, c->mbs));
109: C = *B;
110: } else {
111: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
112: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
113: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
114: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
115: }
116: c = (Mat_SeqBAIJ *)C->data;
117: for (i = 0; i < nrows; i++) {
118: row = irow[i];
119: kstart = ai[row];
120: kend = kstart + a->ilen[row];
121: mat_i = c->i[i];
122: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
123: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
124: mat_ilen = c->ilen + i;
125: for (k = kstart; k < kend; k++) {
126: if ((tcol = ssmap[a->j[k]])) {
127: *mat_j++ = tcol - 1;
128: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
129: mat_a += bs2;
130: (*mat_ilen)++;
131: }
132: }
133: }
134: /* sort */
135: if (c->j && c->a) {
136: MatScalar *work;
137: PetscCall(PetscMalloc1(bs2, &work));
138: for (i = 0; i < nrows; i++) {
139: PetscInt ilen;
140: mat_i = c->i[i];
141: mat_j = c->j + mat_i;
142: mat_a = c->a + mat_i * bs2;
143: ilen = c->ilen[i];
144: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
145: }
146: PetscCall(PetscFree(work));
147: }
149: /* Free work space */
150: PetscCall(ISRestoreIndices(iscol, &icol));
151: PetscCall(PetscFree(smap));
152: PetscCall(PetscFree(lens));
153: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
154: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
156: PetscCall(ISRestoreIndices(isrow, &irow));
157: *B = C;
158: PetscFunctionReturn(PETSC_SUCCESS);
159: }
161: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
162: {
163: IS is1, is2;
165: PetscFunctionBegin;
166: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
167: if (isrow == iscol) {
168: is2 = is1;
169: PetscCall(PetscObjectReference((PetscObject)is2));
170: } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
171: PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
172: PetscCall(ISDestroy(&is1));
173: PetscCall(ISDestroy(&is2));
174: PetscFunctionReturn(PETSC_SUCCESS);
175: }
177: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
178: {
179: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
180: Mat_SubSppt *submatj = c->submatis1;
182: PetscFunctionBegin;
183: PetscCall((*submatj->destroy)(C));
184: PetscCall(MatDestroySubMatrix_Private(submatj));
185: PetscFunctionReturn(PETSC_SUCCESS);
186: }
188: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
189: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
190: {
191: PetscInt i;
192: Mat C;
193: Mat_SeqBAIJ *c;
194: Mat_SubSppt *submatj;
196: PetscFunctionBegin;
197: for (i = 0; i < n; i++) {
198: C = (*mat)[i];
199: c = (Mat_SeqBAIJ *)C->data;
200: submatj = c->submatis1;
201: if (submatj) {
202: if (--((PetscObject)C)->refct <= 0) {
203: PetscCall(PetscFree(C->factorprefix));
204: PetscCall((*submatj->destroy)(C));
205: PetscCall(MatDestroySubMatrix_Private(submatj));
206: PetscCall(PetscFree(C->defaultvectype));
207: PetscCall(PetscFree(C->defaultrandtype));
208: PetscCall(PetscLayoutDestroy(&C->rmap));
209: PetscCall(PetscLayoutDestroy(&C->cmap));
210: PetscCall(PetscHeaderDestroy(&C));
211: }
212: } else {
213: PetscCall(MatDestroy(&C));
214: }
215: }
217: /* Destroy Dummy submatrices created for reuse */
218: PetscCall(MatDestroySubMatrices_Dummy(n, mat));
220: PetscCall(PetscFree(*mat));
221: PetscFunctionReturn(PETSC_SUCCESS);
222: }
224: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
225: {
226: PetscInt i;
228: PetscFunctionBegin;
229: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
231: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: /* Should check that shapes of vectors and matrices match */
236: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
237: {
238: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
239: PetscScalar *z, sum;
240: const PetscScalar *x;
241: const MatScalar *v;
242: PetscInt mbs, i, n;
243: const PetscInt *idx, *ii, *ridx = NULL;
244: PetscBool usecprow = a->compressedrow.use;
246: PetscFunctionBegin;
247: PetscCall(VecGetArrayRead(xx, &x));
248: PetscCall(VecGetArrayWrite(zz, &z));
250: if (usecprow) {
251: mbs = a->compressedrow.nrows;
252: ii = a->compressedrow.i;
253: ridx = a->compressedrow.rindex;
254: PetscCall(PetscArrayzero(z, a->mbs));
255: } else {
256: mbs = a->mbs;
257: ii = a->i;
258: }
260: for (i = 0; i < mbs; i++) {
261: n = ii[1] - ii[0];
262: v = a->a + ii[0];
263: idx = a->j + ii[0];
264: ii++;
265: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266: PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267: sum = 0.0;
268: PetscSparseDensePlusDot(sum, x, v, idx, n);
269: if (usecprow) {
270: z[ridx[i]] = sum;
271: } else {
272: z[i] = sum;
273: }
274: }
275: PetscCall(VecRestoreArrayRead(xx, &x));
276: PetscCall(VecRestoreArrayWrite(zz, &z));
277: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
278: PetscFunctionReturn(PETSC_SUCCESS);
279: }
281: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
282: {
283: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
284: PetscScalar *z = NULL, sum1, sum2, *zarray;
285: const PetscScalar *x, *xb;
286: PetscScalar x1, x2;
287: const MatScalar *v;
288: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
289: PetscBool usecprow = a->compressedrow.use;
291: PetscFunctionBegin;
292: PetscCall(VecGetArrayRead(xx, &x));
293: PetscCall(VecGetArrayWrite(zz, &zarray));
295: idx = a->j;
296: v = a->a;
297: if (usecprow) {
298: mbs = a->compressedrow.nrows;
299: ii = a->compressedrow.i;
300: ridx = a->compressedrow.rindex;
301: PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
302: } else {
303: mbs = a->mbs;
304: ii = a->i;
305: z = zarray;
306: }
308: for (i = 0; i < mbs; i++) {
309: n = ii[1] - ii[0];
310: ii++;
311: sum1 = 0.0;
312: sum2 = 0.0;
313: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
314: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
315: for (j = 0; j < n; j++) {
316: xb = x + 2 * (*idx++);
317: x1 = xb[0];
318: x2 = xb[1];
319: sum1 += v[0] * x1 + v[2] * x2;
320: sum2 += v[1] * x1 + v[3] * x2;
321: v += 4;
322: }
323: if (usecprow) z = zarray + 2 * ridx[i];
324: z[0] = sum1;
325: z[1] = sum2;
326: if (!usecprow) z += 2;
327: }
328: PetscCall(VecRestoreArrayRead(xx, &x));
329: PetscCall(VecRestoreArrayWrite(zz, &zarray));
330: PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
331: PetscFunctionReturn(PETSC_SUCCESS);
332: }
334: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
335: {
336: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
337: PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
338: const PetscScalar *x, *xb;
339: const MatScalar *v;
340: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
341: PetscBool usecprow = a->compressedrow.use;
343: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
344: #pragma disjoint(*v, *z, *xb)
345: #endif
347: PetscFunctionBegin;
348: PetscCall(VecGetArrayRead(xx, &x));
349: PetscCall(VecGetArrayWrite(zz, &zarray));
351: idx = a->j;
352: v = a->a;
353: if (usecprow) {
354: mbs = a->compressedrow.nrows;
355: ii = a->compressedrow.i;
356: ridx = a->compressedrow.rindex;
357: PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
358: } else {
359: mbs = a->mbs;
360: ii = a->i;
361: z = zarray;
362: }
364: for (i = 0; i < mbs; i++) {
365: n = ii[1] - ii[0];
366: ii++;
367: sum1 = 0.0;
368: sum2 = 0.0;
369: sum3 = 0.0;
370: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
371: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
372: for (j = 0; j < n; j++) {
373: xb = x + 3 * (*idx++);
374: x1 = xb[0];
375: x2 = xb[1];
376: x3 = xb[2];
378: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
379: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
380: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
381: v += 9;
382: }
383: if (usecprow) z = zarray + 3 * ridx[i];
384: z[0] = sum1;
385: z[1] = sum2;
386: z[2] = sum3;
387: if (!usecprow) z += 3;
388: }
389: PetscCall(VecRestoreArrayRead(xx, &x));
390: PetscCall(VecRestoreArrayWrite(zz, &zarray));
391: PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
392: PetscFunctionReturn(PETSC_SUCCESS);
393: }
395: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
396: {
397: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
398: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
399: const PetscScalar *x, *xb;
400: const MatScalar *v;
401: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
402: PetscBool usecprow = a->compressedrow.use;
404: PetscFunctionBegin;
405: PetscCall(VecGetArrayRead(xx, &x));
406: PetscCall(VecGetArrayWrite(zz, &zarray));
408: idx = a->j;
409: v = a->a;
410: if (usecprow) {
411: mbs = a->compressedrow.nrows;
412: ii = a->compressedrow.i;
413: ridx = a->compressedrow.rindex;
414: PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
415: } else {
416: mbs = a->mbs;
417: ii = a->i;
418: z = zarray;
419: }
421: for (i = 0; i < mbs; i++) {
422: n = ii[1] - ii[0];
423: ii++;
424: sum1 = 0.0;
425: sum2 = 0.0;
426: sum3 = 0.0;
427: sum4 = 0.0;
429: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
430: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431: for (j = 0; j < n; j++) {
432: xb = x + 4 * (*idx++);
433: x1 = xb[0];
434: x2 = xb[1];
435: x3 = xb[2];
436: x4 = xb[3];
437: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
438: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
439: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
440: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
441: v += 16;
442: }
443: if (usecprow) z = zarray + 4 * ridx[i];
444: z[0] = sum1;
445: z[1] = sum2;
446: z[2] = sum3;
447: z[3] = sum4;
448: if (!usecprow) z += 4;
449: }
450: PetscCall(VecRestoreArrayRead(xx, &x));
451: PetscCall(VecRestoreArrayWrite(zz, &zarray));
452: PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
453: PetscFunctionReturn(PETSC_SUCCESS);
454: }
456: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
457: {
458: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
459: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
460: const PetscScalar *xb, *x;
461: const MatScalar *v;
462: const PetscInt *idx, *ii, *ridx = NULL;
463: PetscInt mbs, i, j, n;
464: PetscBool usecprow = a->compressedrow.use;
466: PetscFunctionBegin;
467: PetscCall(VecGetArrayRead(xx, &x));
468: PetscCall(VecGetArrayWrite(zz, &zarray));
470: idx = a->j;
471: v = a->a;
472: if (usecprow) {
473: mbs = a->compressedrow.nrows;
474: ii = a->compressedrow.i;
475: ridx = a->compressedrow.rindex;
476: PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
477: } else {
478: mbs = a->mbs;
479: ii = a->i;
480: z = zarray;
481: }
483: for (i = 0; i < mbs; i++) {
484: n = ii[1] - ii[0];
485: ii++;
486: sum1 = 0.0;
487: sum2 = 0.0;
488: sum3 = 0.0;
489: sum4 = 0.0;
490: sum5 = 0.0;
491: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493: for (j = 0; j < n; j++) {
494: xb = x + 5 * (*idx++);
495: x1 = xb[0];
496: x2 = xb[1];
497: x3 = xb[2];
498: x4 = xb[3];
499: x5 = xb[4];
500: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
501: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
502: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
503: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
504: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
505: v += 25;
506: }
507: if (usecprow) z = zarray + 5 * ridx[i];
508: z[0] = sum1;
509: z[1] = sum2;
510: z[2] = sum3;
511: z[3] = sum4;
512: z[4] = sum5;
513: if (!usecprow) z += 5;
514: }
515: PetscCall(VecRestoreArrayRead(xx, &x));
516: PetscCall(VecRestoreArrayWrite(zz, &zarray));
517: PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
518: PetscFunctionReturn(PETSC_SUCCESS);
519: }
521: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
522: {
523: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
524: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
525: const PetscScalar *x, *xb;
526: PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
527: const MatScalar *v;
528: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
529: PetscBool usecprow = a->compressedrow.use;
531: PetscFunctionBegin;
532: PetscCall(VecGetArrayRead(xx, &x));
533: PetscCall(VecGetArrayWrite(zz, &zarray));
535: idx = a->j;
536: v = a->a;
537: if (usecprow) {
538: mbs = a->compressedrow.nrows;
539: ii = a->compressedrow.i;
540: ridx = a->compressedrow.rindex;
541: PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
542: } else {
543: mbs = a->mbs;
544: ii = a->i;
545: z = zarray;
546: }
548: for (i = 0; i < mbs; i++) {
549: n = ii[1] - ii[0];
550: ii++;
551: sum1 = 0.0;
552: sum2 = 0.0;
553: sum3 = 0.0;
554: sum4 = 0.0;
555: sum5 = 0.0;
556: sum6 = 0.0;
558: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
559: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
560: for (j = 0; j < n; j++) {
561: xb = x + 6 * (*idx++);
562: x1 = xb[0];
563: x2 = xb[1];
564: x3 = xb[2];
565: x4 = xb[3];
566: x5 = xb[4];
567: x6 = xb[5];
568: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
569: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
570: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
571: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
572: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
573: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
574: v += 36;
575: }
576: if (usecprow) z = zarray + 6 * ridx[i];
577: z[0] = sum1;
578: z[1] = sum2;
579: z[2] = sum3;
580: z[3] = sum4;
581: z[4] = sum5;
582: z[5] = sum6;
583: if (!usecprow) z += 6;
584: }
586: PetscCall(VecRestoreArrayRead(xx, &x));
587: PetscCall(VecRestoreArrayWrite(zz, &zarray));
588: PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
589: PetscFunctionReturn(PETSC_SUCCESS);
590: }
592: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
593: {
594: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
595: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
596: const PetscScalar *x, *xb;
597: PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
598: const MatScalar *v;
599: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
600: PetscBool usecprow = a->compressedrow.use;
602: PetscFunctionBegin;
603: PetscCall(VecGetArrayRead(xx, &x));
604: PetscCall(VecGetArrayWrite(zz, &zarray));
606: idx = a->j;
607: v = a->a;
608: if (usecprow) {
609: mbs = a->compressedrow.nrows;
610: ii = a->compressedrow.i;
611: ridx = a->compressedrow.rindex;
612: PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
613: } else {
614: mbs = a->mbs;
615: ii = a->i;
616: z = zarray;
617: }
619: for (i = 0; i < mbs; i++) {
620: n = ii[1] - ii[0];
621: ii++;
622: sum1 = 0.0;
623: sum2 = 0.0;
624: sum3 = 0.0;
625: sum4 = 0.0;
626: sum5 = 0.0;
627: sum6 = 0.0;
628: sum7 = 0.0;
630: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
631: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
632: for (j = 0; j < n; j++) {
633: xb = x + 7 * (*idx++);
634: x1 = xb[0];
635: x2 = xb[1];
636: x3 = xb[2];
637: x4 = xb[3];
638: x5 = xb[4];
639: x6 = xb[5];
640: x7 = xb[6];
641: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
642: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
643: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
644: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
645: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
646: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
647: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
648: v += 49;
649: }
650: if (usecprow) z = zarray + 7 * ridx[i];
651: z[0] = sum1;
652: z[1] = sum2;
653: z[2] = sum3;
654: z[3] = sum4;
655: z[4] = sum5;
656: z[5] = sum6;
657: z[6] = sum7;
658: if (!usecprow) z += 7;
659: }
661: PetscCall(VecRestoreArrayRead(xx, &x));
662: PetscCall(VecRestoreArrayWrite(zz, &zarray));
663: PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
664: PetscFunctionReturn(PETSC_SUCCESS);
665: }
667: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
668: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
669: {
670: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
671: PetscScalar *z = NULL, *work, *workt, *zarray;
672: const PetscScalar *x, *xb;
673: const MatScalar *v;
674: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
675: const PetscInt *idx, *ii, *ridx = NULL;
676: PetscInt k;
677: PetscBool usecprow = a->compressedrow.use;
679: __m256d a0, a1, a2, a3, a4, a5;
680: __m256d w0, w1, w2, w3;
681: __m256d z0, z1, z2;
682: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
684: PetscFunctionBegin;
685: PetscCall(VecGetArrayRead(xx, &x));
686: PetscCall(VecGetArrayWrite(zz, &zarray));
688: idx = a->j;
689: v = a->a;
690: if (usecprow) {
691: mbs = a->compressedrow.nrows;
692: ii = a->compressedrow.i;
693: ridx = a->compressedrow.rindex;
694: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
695: } else {
696: mbs = a->mbs;
697: ii = a->i;
698: z = zarray;
699: }
701: if (!a->mult_work) {
702: k = PetscMax(A->rmap->n, A->cmap->n);
703: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
704: }
706: work = a->mult_work;
707: for (i = 0; i < mbs; i++) {
708: n = ii[1] - ii[0];
709: ii++;
710: workt = work;
711: for (j = 0; j < n; j++) {
712: xb = x + bs * (*idx++);
713: for (k = 0; k < bs; k++) workt[k] = xb[k];
714: workt += bs;
715: }
716: if (usecprow) z = zarray + bs * ridx[i];
718: z0 = _mm256_setzero_pd();
719: z1 = _mm256_setzero_pd();
720: z2 = _mm256_setzero_pd();
722: for (j = 0; j < n; j++) {
723: /* first column of a */
724: w0 = _mm256_set1_pd(work[j * 9]);
725: a0 = _mm256_loadu_pd(&v[j * 81]);
726: z0 = _mm256_fmadd_pd(a0, w0, z0);
727: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
728: z1 = _mm256_fmadd_pd(a1, w0, z1);
729: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
730: z2 = _mm256_fmadd_pd(a2, w0, z2);
732: /* second column of a */
733: w1 = _mm256_set1_pd(work[j * 9 + 1]);
734: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
735: z0 = _mm256_fmadd_pd(a0, w1, z0);
736: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
737: z1 = _mm256_fmadd_pd(a1, w1, z1);
738: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
739: z2 = _mm256_fmadd_pd(a2, w1, z2);
741: /* third column of a */
742: w2 = _mm256_set1_pd(work[j * 9 + 2]);
743: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
744: z0 = _mm256_fmadd_pd(a3, w2, z0);
745: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
746: z1 = _mm256_fmadd_pd(a4, w2, z1);
747: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
748: z2 = _mm256_fmadd_pd(a5, w2, z2);
750: /* fourth column of a */
751: w3 = _mm256_set1_pd(work[j * 9 + 3]);
752: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
753: z0 = _mm256_fmadd_pd(a0, w3, z0);
754: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
755: z1 = _mm256_fmadd_pd(a1, w3, z1);
756: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
757: z2 = _mm256_fmadd_pd(a2, w3, z2);
759: /* fifth column of a */
760: w0 = _mm256_set1_pd(work[j * 9 + 4]);
761: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
762: z0 = _mm256_fmadd_pd(a3, w0, z0);
763: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
764: z1 = _mm256_fmadd_pd(a4, w0, z1);
765: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
766: z2 = _mm256_fmadd_pd(a5, w0, z2);
768: /* sixth column of a */
769: w1 = _mm256_set1_pd(work[j * 9 + 5]);
770: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
771: z0 = _mm256_fmadd_pd(a0, w1, z0);
772: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
773: z1 = _mm256_fmadd_pd(a1, w1, z1);
774: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
775: z2 = _mm256_fmadd_pd(a2, w1, z2);
777: /* seventh column of a */
778: w2 = _mm256_set1_pd(work[j * 9 + 6]);
779: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
780: z0 = _mm256_fmadd_pd(a0, w2, z0);
781: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
782: z1 = _mm256_fmadd_pd(a1, w2, z1);
783: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
784: z2 = _mm256_fmadd_pd(a2, w2, z2);
786: /* eighth column of a */
787: w3 = _mm256_set1_pd(work[j * 9 + 7]);
788: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
789: z0 = _mm256_fmadd_pd(a3, w3, z0);
790: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
791: z1 = _mm256_fmadd_pd(a4, w3, z1);
792: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
793: z2 = _mm256_fmadd_pd(a5, w3, z2);
795: /* ninth column of a */
796: w0 = _mm256_set1_pd(work[j * 9 + 8]);
797: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
798: z0 = _mm256_fmadd_pd(a0, w0, z0);
799: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
800: z1 = _mm256_fmadd_pd(a1, w0, z1);
801: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
802: z2 = _mm256_fmadd_pd(a2, w0, z2);
803: }
805: _mm256_storeu_pd(&z[0], z0);
806: _mm256_storeu_pd(&z[4], z1);
807: _mm256_maskstore_pd(&z[8], mask1, z2);
809: v += n * bs2;
810: if (!usecprow) z += bs;
811: }
812: PetscCall(VecRestoreArrayRead(xx, &x));
813: PetscCall(VecRestoreArrayWrite(zz, &zarray));
814: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
815: PetscFunctionReturn(PETSC_SUCCESS);
816: }
817: #endif
819: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
820: {
821: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
822: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
823: const PetscScalar *x, *xb;
824: PetscScalar *zarray, xv;
825: const MatScalar *v;
826: const PetscInt *ii, *ij = a->j, *idx;
827: PetscInt mbs, i, j, k, n, *ridx = NULL;
828: PetscBool usecprow = a->compressedrow.use;
830: PetscFunctionBegin;
831: PetscCall(VecGetArrayRead(xx, &x));
832: PetscCall(VecGetArrayWrite(zz, &zarray));
834: v = a->a;
835: if (usecprow) {
836: mbs = a->compressedrow.nrows;
837: ii = a->compressedrow.i;
838: ridx = a->compressedrow.rindex;
839: PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
840: } else {
841: mbs = a->mbs;
842: ii = a->i;
843: z = zarray;
844: }
846: for (i = 0; i < mbs; i++) {
847: n = ii[i + 1] - ii[i];
848: idx = ij + ii[i];
849: sum1 = 0.0;
850: sum2 = 0.0;
851: sum3 = 0.0;
852: sum4 = 0.0;
853: sum5 = 0.0;
854: sum6 = 0.0;
855: sum7 = 0.0;
856: sum8 = 0.0;
857: sum9 = 0.0;
858: sum10 = 0.0;
859: sum11 = 0.0;
861: for (j = 0; j < n; j++) {
862: xb = x + 11 * (idx[j]);
864: for (k = 0; k < 11; k++) {
865: xv = xb[k];
866: sum1 += v[0] * xv;
867: sum2 += v[1] * xv;
868: sum3 += v[2] * xv;
869: sum4 += v[3] * xv;
870: sum5 += v[4] * xv;
871: sum6 += v[5] * xv;
872: sum7 += v[6] * xv;
873: sum8 += v[7] * xv;
874: sum9 += v[8] * xv;
875: sum10 += v[9] * xv;
876: sum11 += v[10] * xv;
877: v += 11;
878: }
879: }
880: if (usecprow) z = zarray + 11 * ridx[i];
881: z[0] = sum1;
882: z[1] = sum2;
883: z[2] = sum3;
884: z[3] = sum4;
885: z[4] = sum5;
886: z[5] = sum6;
887: z[6] = sum7;
888: z[7] = sum8;
889: z[8] = sum9;
890: z[9] = sum10;
891: z[10] = sum11;
893: if (!usecprow) z += 11;
894: }
896: PetscCall(VecRestoreArrayRead(xx, &x));
897: PetscCall(VecRestoreArrayWrite(zz, &zarray));
898: PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
899: PetscFunctionReturn(PETSC_SUCCESS);
900: }
902: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
903: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
904: {
905: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
906: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
907: const PetscScalar *x, *xb;
908: PetscScalar *zarray, xv;
909: const MatScalar *v;
910: const PetscInt *ii, *ij = a->j, *idx;
911: PetscInt mbs, i, j, k, n, *ridx = NULL;
912: PetscBool usecprow = a->compressedrow.use;
914: PetscFunctionBegin;
915: PetscCall(VecGetArrayRead(xx, &x));
916: PetscCall(VecGetArrayWrite(zz, &zarray));
918: v = a->a;
919: if (usecprow) {
920: mbs = a->compressedrow.nrows;
921: ii = a->compressedrow.i;
922: ridx = a->compressedrow.rindex;
923: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
924: } else {
925: mbs = a->mbs;
926: ii = a->i;
927: z = zarray;
928: }
930: for (i = 0; i < mbs; i++) {
931: n = ii[i + 1] - ii[i];
932: idx = ij + ii[i];
933: sum1 = 0.0;
934: sum2 = 0.0;
935: sum3 = 0.0;
936: sum4 = 0.0;
937: sum5 = 0.0;
938: sum6 = 0.0;
939: sum7 = 0.0;
940: sum8 = 0.0;
941: sum9 = 0.0;
942: sum10 = 0.0;
943: sum11 = 0.0;
944: sum12 = 0.0;
946: for (j = 0; j < n; j++) {
947: xb = x + 12 * (idx[j]);
949: for (k = 0; k < 12; k++) {
950: xv = xb[k];
951: sum1 += v[0] * xv;
952: sum2 += v[1] * xv;
953: sum3 += v[2] * xv;
954: sum4 += v[3] * xv;
955: sum5 += v[4] * xv;
956: sum6 += v[5] * xv;
957: sum7 += v[6] * xv;
958: sum8 += v[7] * xv;
959: sum9 += v[8] * xv;
960: sum10 += v[9] * xv;
961: sum11 += v[10] * xv;
962: sum12 += v[11] * xv;
963: v += 12;
964: }
965: }
966: if (usecprow) z = zarray + 12 * ridx[i];
967: z[0] = sum1;
968: z[1] = sum2;
969: z[2] = sum3;
970: z[3] = sum4;
971: z[4] = sum5;
972: z[5] = sum6;
973: z[6] = sum7;
974: z[7] = sum8;
975: z[8] = sum9;
976: z[9] = sum10;
977: z[10] = sum11;
978: z[11] = sum12;
979: if (!usecprow) z += 12;
980: }
981: PetscCall(VecRestoreArrayRead(xx, &x));
982: PetscCall(VecRestoreArrayWrite(zz, &zarray));
983: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
988: {
989: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
990: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
991: const PetscScalar *x, *xb;
992: PetscScalar *zarray, *yarray, xv;
993: const MatScalar *v;
994: const PetscInt *ii, *ij = a->j, *idx;
995: PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
996: PetscBool usecprow = a->compressedrow.use;
998: PetscFunctionBegin;
999: PetscCall(VecGetArrayRead(xx, &x));
1000: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1002: v = a->a;
1003: if (usecprow) {
1004: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1005: mbs = a->compressedrow.nrows;
1006: ii = a->compressedrow.i;
1007: ridx = a->compressedrow.rindex;
1008: } else {
1009: ii = a->i;
1010: y = yarray;
1011: z = zarray;
1012: }
1014: for (i = 0; i < mbs; i++) {
1015: n = ii[i + 1] - ii[i];
1016: idx = ij + ii[i];
1018: if (usecprow) {
1019: y = yarray + 12 * ridx[i];
1020: z = zarray + 12 * ridx[i];
1021: }
1022: sum1 = y[0];
1023: sum2 = y[1];
1024: sum3 = y[2];
1025: sum4 = y[3];
1026: sum5 = y[4];
1027: sum6 = y[5];
1028: sum7 = y[6];
1029: sum8 = y[7];
1030: sum9 = y[8];
1031: sum10 = y[9];
1032: sum11 = y[10];
1033: sum12 = y[11];
1035: for (j = 0; j < n; j++) {
1036: xb = x + 12 * (idx[j]);
1038: for (k = 0; k < 12; k++) {
1039: xv = xb[k];
1040: sum1 += v[0] * xv;
1041: sum2 += v[1] * xv;
1042: sum3 += v[2] * xv;
1043: sum4 += v[3] * xv;
1044: sum5 += v[4] * xv;
1045: sum6 += v[5] * xv;
1046: sum7 += v[6] * xv;
1047: sum8 += v[7] * xv;
1048: sum9 += v[8] * xv;
1049: sum10 += v[9] * xv;
1050: sum11 += v[10] * xv;
1051: sum12 += v[11] * xv;
1052: v += 12;
1053: }
1054: }
1056: z[0] = sum1;
1057: z[1] = sum2;
1058: z[2] = sum3;
1059: z[3] = sum4;
1060: z[4] = sum5;
1061: z[5] = sum6;
1062: z[6] = sum7;
1063: z[7] = sum8;
1064: z[8] = sum9;
1065: z[9] = sum10;
1066: z[10] = sum11;
1067: z[11] = sum12;
1068: if (!usecprow) {
1069: y += 12;
1070: z += 12;
1071: }
1072: }
1073: PetscCall(VecRestoreArrayRead(xx, &x));
1074: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1075: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1076: PetscFunctionReturn(PETSC_SUCCESS);
1077: }
1079: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1080: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1081: {
1082: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1083: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1084: const PetscScalar *x, *xb;
1085: PetscScalar x1, x2, x3, x4, *zarray;
1086: const MatScalar *v;
1087: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1088: PetscInt mbs, i, j, n;
1089: PetscBool usecprow = a->compressedrow.use;
1091: PetscFunctionBegin;
1092: PetscCall(VecGetArrayRead(xx, &x));
1093: PetscCall(VecGetArrayWrite(zz, &zarray));
1095: v = a->a;
1096: if (usecprow) {
1097: mbs = a->compressedrow.nrows;
1098: ii = a->compressedrow.i;
1099: ridx = a->compressedrow.rindex;
1100: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1101: } else {
1102: mbs = a->mbs;
1103: ii = a->i;
1104: z = zarray;
1105: }
1107: for (i = 0; i < mbs; i++) {
1108: n = ii[i + 1] - ii[i];
1109: idx = ij + ii[i];
1111: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1112: for (j = 0; j < n; j++) {
1113: xb = x + 12 * (idx[j]);
1114: x1 = xb[0];
1115: x2 = xb[1];
1116: x3 = xb[2];
1117: x4 = xb[3];
1119: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1120: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1121: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1122: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1123: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1124: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1125: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1126: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1127: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1128: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1129: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1130: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1131: v += 48;
1133: x1 = xb[4];
1134: x2 = xb[5];
1135: x3 = xb[6];
1136: x4 = xb[7];
1138: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150: v += 48;
1152: x1 = xb[8];
1153: x2 = xb[9];
1154: x3 = xb[10];
1155: x4 = xb[11];
1156: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168: v += 48;
1169: }
1170: if (usecprow) z = zarray + 12 * ridx[i];
1171: z[0] = sum1;
1172: z[1] = sum2;
1173: z[2] = sum3;
1174: z[3] = sum4;
1175: z[4] = sum5;
1176: z[5] = sum6;
1177: z[6] = sum7;
1178: z[7] = sum8;
1179: z[8] = sum9;
1180: z[9] = sum10;
1181: z[10] = sum11;
1182: z[11] = sum12;
1183: if (!usecprow) z += 12;
1184: }
1185: PetscCall(VecRestoreArrayRead(xx, &x));
1186: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1187: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1188: PetscFunctionReturn(PETSC_SUCCESS);
1189: }
1191: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1192: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1193: {
1194: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1195: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1196: const PetscScalar *x, *xb;
1197: PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1198: const MatScalar *v;
1199: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1200: PetscInt mbs = a->mbs, i, j, n;
1201: PetscBool usecprow = a->compressedrow.use;
1203: PetscFunctionBegin;
1204: PetscCall(VecGetArrayRead(xx, &x));
1205: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1207: v = a->a;
1208: if (usecprow) {
1209: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1210: mbs = a->compressedrow.nrows;
1211: ii = a->compressedrow.i;
1212: ridx = a->compressedrow.rindex;
1213: } else {
1214: ii = a->i;
1215: y = yarray;
1216: z = zarray;
1217: }
1219: for (i = 0; i < mbs; i++) {
1220: n = ii[i + 1] - ii[i];
1221: idx = ij + ii[i];
1223: if (usecprow) {
1224: y = yarray + 12 * ridx[i];
1225: z = zarray + 12 * ridx[i];
1226: }
1227: sum1 = y[0];
1228: sum2 = y[1];
1229: sum3 = y[2];
1230: sum4 = y[3];
1231: sum5 = y[4];
1232: sum6 = y[5];
1233: sum7 = y[6];
1234: sum8 = y[7];
1235: sum9 = y[8];
1236: sum10 = y[9];
1237: sum11 = y[10];
1238: sum12 = y[11];
1240: for (j = 0; j < n; j++) {
1241: xb = x + 12 * (idx[j]);
1242: x1 = xb[0];
1243: x2 = xb[1];
1244: x3 = xb[2];
1245: x4 = xb[3];
1247: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1248: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1249: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1250: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1251: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1252: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1253: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1254: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1255: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1256: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1257: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1258: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1259: v += 48;
1261: x1 = xb[4];
1262: x2 = xb[5];
1263: x3 = xb[6];
1264: x4 = xb[7];
1266: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1267: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1268: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1269: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1270: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1271: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1272: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1273: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1274: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1275: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1276: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1277: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1278: v += 48;
1280: x1 = xb[8];
1281: x2 = xb[9];
1282: x3 = xb[10];
1283: x4 = xb[11];
1284: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296: v += 48;
1297: }
1298: z[0] = sum1;
1299: z[1] = sum2;
1300: z[2] = sum3;
1301: z[3] = sum4;
1302: z[4] = sum5;
1303: z[5] = sum6;
1304: z[6] = sum7;
1305: z[7] = sum8;
1306: z[8] = sum9;
1307: z[9] = sum10;
1308: z[10] = sum11;
1309: z[11] = sum12;
1310: if (!usecprow) {
1311: y += 12;
1312: z += 12;
1313: }
1314: }
1315: PetscCall(VecRestoreArrayRead(xx, &x));
1316: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1317: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1318: PetscFunctionReturn(PETSC_SUCCESS);
1319: }
1321: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1322: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1323: {
1324: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1325: PetscScalar *z = NULL, *zarray;
1326: const PetscScalar *x, *work;
1327: const MatScalar *v = a->a;
1328: PetscInt mbs, i, j, n;
1329: const PetscInt *idx = a->j, *ii, *ridx = NULL;
1330: PetscBool usecprow = a->compressedrow.use;
1331: const PetscInt bs = 12, bs2 = 144;
1333: __m256d a0, a1, a2, a3, a4, a5;
1334: __m256d w0, w1, w2, w3;
1335: __m256d z0, z1, z2;
1337: PetscFunctionBegin;
1338: PetscCall(VecGetArrayRead(xx, &x));
1339: PetscCall(VecGetArrayWrite(zz, &zarray));
1341: if (usecprow) {
1342: mbs = a->compressedrow.nrows;
1343: ii = a->compressedrow.i;
1344: ridx = a->compressedrow.rindex;
1345: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1346: } else {
1347: mbs = a->mbs;
1348: ii = a->i;
1349: z = zarray;
1350: }
1352: for (i = 0; i < mbs; i++) {
1353: z0 = _mm256_setzero_pd();
1354: z1 = _mm256_setzero_pd();
1355: z2 = _mm256_setzero_pd();
1357: n = ii[1] - ii[0];
1358: ii++;
1359: for (j = 0; j < n; j++) {
1360: work = x + bs * (*idx++);
1362: /* first column of a */
1363: w0 = _mm256_set1_pd(work[0]);
1364: a0 = _mm256_loadu_pd(v + 0);
1365: z0 = _mm256_fmadd_pd(a0, w0, z0);
1366: a1 = _mm256_loadu_pd(v + 4);
1367: z1 = _mm256_fmadd_pd(a1, w0, z1);
1368: a2 = _mm256_loadu_pd(v + 8);
1369: z2 = _mm256_fmadd_pd(a2, w0, z2);
1371: /* second column of a */
1372: w1 = _mm256_set1_pd(work[1]);
1373: a3 = _mm256_loadu_pd(v + 12);
1374: z0 = _mm256_fmadd_pd(a3, w1, z0);
1375: a4 = _mm256_loadu_pd(v + 16);
1376: z1 = _mm256_fmadd_pd(a4, w1, z1);
1377: a5 = _mm256_loadu_pd(v + 20);
1378: z2 = _mm256_fmadd_pd(a5, w1, z2);
1380: /* third column of a */
1381: w2 = _mm256_set1_pd(work[2]);
1382: a0 = _mm256_loadu_pd(v + 24);
1383: z0 = _mm256_fmadd_pd(a0, w2, z0);
1384: a1 = _mm256_loadu_pd(v + 28);
1385: z1 = _mm256_fmadd_pd(a1, w2, z1);
1386: a2 = _mm256_loadu_pd(v + 32);
1387: z2 = _mm256_fmadd_pd(a2, w2, z2);
1389: /* fourth column of a */
1390: w3 = _mm256_set1_pd(work[3]);
1391: a3 = _mm256_loadu_pd(v + 36);
1392: z0 = _mm256_fmadd_pd(a3, w3, z0);
1393: a4 = _mm256_loadu_pd(v + 40);
1394: z1 = _mm256_fmadd_pd(a4, w3, z1);
1395: a5 = _mm256_loadu_pd(v + 44);
1396: z2 = _mm256_fmadd_pd(a5, w3, z2);
1398: /* fifth column of a */
1399: w0 = _mm256_set1_pd(work[4]);
1400: a0 = _mm256_loadu_pd(v + 48);
1401: z0 = _mm256_fmadd_pd(a0, w0, z0);
1402: a1 = _mm256_loadu_pd(v + 52);
1403: z1 = _mm256_fmadd_pd(a1, w0, z1);
1404: a2 = _mm256_loadu_pd(v + 56);
1405: z2 = _mm256_fmadd_pd(a2, w0, z2);
1407: /* sixth column of a */
1408: w1 = _mm256_set1_pd(work[5]);
1409: a3 = _mm256_loadu_pd(v + 60);
1410: z0 = _mm256_fmadd_pd(a3, w1, z0);
1411: a4 = _mm256_loadu_pd(v + 64);
1412: z1 = _mm256_fmadd_pd(a4, w1, z1);
1413: a5 = _mm256_loadu_pd(v + 68);
1414: z2 = _mm256_fmadd_pd(a5, w1, z2);
1416: /* seventh column of a */
1417: w2 = _mm256_set1_pd(work[6]);
1418: a0 = _mm256_loadu_pd(v + 72);
1419: z0 = _mm256_fmadd_pd(a0, w2, z0);
1420: a1 = _mm256_loadu_pd(v + 76);
1421: z1 = _mm256_fmadd_pd(a1, w2, z1);
1422: a2 = _mm256_loadu_pd(v + 80);
1423: z2 = _mm256_fmadd_pd(a2, w2, z2);
1425: /* eighth column of a */
1426: w3 = _mm256_set1_pd(work[7]);
1427: a3 = _mm256_loadu_pd(v + 84);
1428: z0 = _mm256_fmadd_pd(a3, w3, z0);
1429: a4 = _mm256_loadu_pd(v + 88);
1430: z1 = _mm256_fmadd_pd(a4, w3, z1);
1431: a5 = _mm256_loadu_pd(v + 92);
1432: z2 = _mm256_fmadd_pd(a5, w3, z2);
1434: /* ninth column of a */
1435: w0 = _mm256_set1_pd(work[8]);
1436: a0 = _mm256_loadu_pd(v + 96);
1437: z0 = _mm256_fmadd_pd(a0, w0, z0);
1438: a1 = _mm256_loadu_pd(v + 100);
1439: z1 = _mm256_fmadd_pd(a1, w0, z1);
1440: a2 = _mm256_loadu_pd(v + 104);
1441: z2 = _mm256_fmadd_pd(a2, w0, z2);
1443: /* tenth column of a */
1444: w1 = _mm256_set1_pd(work[9]);
1445: a3 = _mm256_loadu_pd(v + 108);
1446: z0 = _mm256_fmadd_pd(a3, w1, z0);
1447: a4 = _mm256_loadu_pd(v + 112);
1448: z1 = _mm256_fmadd_pd(a4, w1, z1);
1449: a5 = _mm256_loadu_pd(v + 116);
1450: z2 = _mm256_fmadd_pd(a5, w1, z2);
1452: /* eleventh column of a */
1453: w2 = _mm256_set1_pd(work[10]);
1454: a0 = _mm256_loadu_pd(v + 120);
1455: z0 = _mm256_fmadd_pd(a0, w2, z0);
1456: a1 = _mm256_loadu_pd(v + 124);
1457: z1 = _mm256_fmadd_pd(a1, w2, z1);
1458: a2 = _mm256_loadu_pd(v + 128);
1459: z2 = _mm256_fmadd_pd(a2, w2, z2);
1461: /* twelveth column of a */
1462: w3 = _mm256_set1_pd(work[11]);
1463: a3 = _mm256_loadu_pd(v + 132);
1464: z0 = _mm256_fmadd_pd(a3, w3, z0);
1465: a4 = _mm256_loadu_pd(v + 136);
1466: z1 = _mm256_fmadd_pd(a4, w3, z1);
1467: a5 = _mm256_loadu_pd(v + 140);
1468: z2 = _mm256_fmadd_pd(a5, w3, z2);
1470: v += bs2;
1471: }
1472: if (usecprow) z = zarray + bs * ridx[i];
1473: _mm256_storeu_pd(&z[0], z0);
1474: _mm256_storeu_pd(&z[4], z1);
1475: _mm256_storeu_pd(&z[8], z2);
1476: if (!usecprow) z += bs;
1477: }
1478: PetscCall(VecRestoreArrayRead(xx, &x));
1479: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1480: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1481: PetscFunctionReturn(PETSC_SUCCESS);
1482: }
1483: #endif
1485: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1486: /* Default MatMult for block size 15 */
1487: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1488: {
1489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1490: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1491: const PetscScalar *x, *xb;
1492: PetscScalar *zarray, xv;
1493: const MatScalar *v;
1494: const PetscInt *ii, *ij = a->j, *idx;
1495: PetscInt mbs, i, j, k, n, *ridx = NULL;
1496: PetscBool usecprow = a->compressedrow.use;
1498: PetscFunctionBegin;
1499: PetscCall(VecGetArrayRead(xx, &x));
1500: PetscCall(VecGetArrayWrite(zz, &zarray));
1502: v = a->a;
1503: if (usecprow) {
1504: mbs = a->compressedrow.nrows;
1505: ii = a->compressedrow.i;
1506: ridx = a->compressedrow.rindex;
1507: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1508: } else {
1509: mbs = a->mbs;
1510: ii = a->i;
1511: z = zarray;
1512: }
1514: for (i = 0; i < mbs; i++) {
1515: n = ii[i + 1] - ii[i];
1516: idx = ij + ii[i];
1517: sum1 = 0.0;
1518: sum2 = 0.0;
1519: sum3 = 0.0;
1520: sum4 = 0.0;
1521: sum5 = 0.0;
1522: sum6 = 0.0;
1523: sum7 = 0.0;
1524: sum8 = 0.0;
1525: sum9 = 0.0;
1526: sum10 = 0.0;
1527: sum11 = 0.0;
1528: sum12 = 0.0;
1529: sum13 = 0.0;
1530: sum14 = 0.0;
1531: sum15 = 0.0;
1533: for (j = 0; j < n; j++) {
1534: xb = x + 15 * (idx[j]);
1536: for (k = 0; k < 15; k++) {
1537: xv = xb[k];
1538: sum1 += v[0] * xv;
1539: sum2 += v[1] * xv;
1540: sum3 += v[2] * xv;
1541: sum4 += v[3] * xv;
1542: sum5 += v[4] * xv;
1543: sum6 += v[5] * xv;
1544: sum7 += v[6] * xv;
1545: sum8 += v[7] * xv;
1546: sum9 += v[8] * xv;
1547: sum10 += v[9] * xv;
1548: sum11 += v[10] * xv;
1549: sum12 += v[11] * xv;
1550: sum13 += v[12] * xv;
1551: sum14 += v[13] * xv;
1552: sum15 += v[14] * xv;
1553: v += 15;
1554: }
1555: }
1556: if (usecprow) z = zarray + 15 * ridx[i];
1557: z[0] = sum1;
1558: z[1] = sum2;
1559: z[2] = sum3;
1560: z[3] = sum4;
1561: z[4] = sum5;
1562: z[5] = sum6;
1563: z[6] = sum7;
1564: z[7] = sum8;
1565: z[8] = sum9;
1566: z[9] = sum10;
1567: z[10] = sum11;
1568: z[11] = sum12;
1569: z[12] = sum13;
1570: z[13] = sum14;
1571: z[14] = sum15;
1573: if (!usecprow) z += 15;
1574: }
1576: PetscCall(VecRestoreArrayRead(xx, &x));
1577: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1578: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1579: PetscFunctionReturn(PETSC_SUCCESS);
1580: }
1582: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1583: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1584: {
1585: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1586: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1587: const PetscScalar *x, *xb;
1588: PetscScalar x1, x2, x3, x4, *zarray;
1589: const MatScalar *v;
1590: const PetscInt *ii, *ij = a->j, *idx;
1591: PetscInt mbs, i, j, n, *ridx = NULL;
1592: PetscBool usecprow = a->compressedrow.use;
1594: PetscFunctionBegin;
1595: PetscCall(VecGetArrayRead(xx, &x));
1596: PetscCall(VecGetArrayWrite(zz, &zarray));
1598: v = a->a;
1599: if (usecprow) {
1600: mbs = a->compressedrow.nrows;
1601: ii = a->compressedrow.i;
1602: ridx = a->compressedrow.rindex;
1603: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1604: } else {
1605: mbs = a->mbs;
1606: ii = a->i;
1607: z = zarray;
1608: }
1610: for (i = 0; i < mbs; i++) {
1611: n = ii[i + 1] - ii[i];
1612: idx = ij + ii[i];
1613: sum1 = 0.0;
1614: sum2 = 0.0;
1615: sum3 = 0.0;
1616: sum4 = 0.0;
1617: sum5 = 0.0;
1618: sum6 = 0.0;
1619: sum7 = 0.0;
1620: sum8 = 0.0;
1621: sum9 = 0.0;
1622: sum10 = 0.0;
1623: sum11 = 0.0;
1624: sum12 = 0.0;
1625: sum13 = 0.0;
1626: sum14 = 0.0;
1627: sum15 = 0.0;
1629: for (j = 0; j < n; j++) {
1630: xb = x + 15 * (idx[j]);
1631: x1 = xb[0];
1632: x2 = xb[1];
1633: x3 = xb[2];
1634: x4 = xb[3];
1636: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1637: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1638: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1639: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1640: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1641: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1642: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1643: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1644: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1645: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1646: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1647: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1648: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1649: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1650: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1652: v += 60;
1654: x1 = xb[4];
1655: x2 = xb[5];
1656: x3 = xb[6];
1657: x4 = xb[7];
1659: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1660: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1661: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1662: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1663: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1664: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1665: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1666: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1667: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1668: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1669: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1670: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1671: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1672: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1673: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1674: v += 60;
1676: x1 = xb[8];
1677: x2 = xb[9];
1678: x3 = xb[10];
1679: x4 = xb[11];
1680: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1681: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1682: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1683: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1684: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1685: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1686: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1687: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1688: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1689: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1690: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1691: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1692: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1693: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1694: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1695: v += 60;
1697: x1 = xb[12];
1698: x2 = xb[13];
1699: x3 = xb[14];
1700: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1701: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1702: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1703: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1704: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1705: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1706: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1707: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1708: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1709: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1710: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1711: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1712: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1713: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1714: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1715: v += 45;
1716: }
1717: if (usecprow) z = zarray + 15 * ridx[i];
1718: z[0] = sum1;
1719: z[1] = sum2;
1720: z[2] = sum3;
1721: z[3] = sum4;
1722: z[4] = sum5;
1723: z[5] = sum6;
1724: z[6] = sum7;
1725: z[7] = sum8;
1726: z[8] = sum9;
1727: z[9] = sum10;
1728: z[10] = sum11;
1729: z[11] = sum12;
1730: z[12] = sum13;
1731: z[13] = sum14;
1732: z[14] = sum15;
1734: if (!usecprow) z += 15;
1735: }
1737: PetscCall(VecRestoreArrayRead(xx, &x));
1738: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1739: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1740: PetscFunctionReturn(PETSC_SUCCESS);
1741: }
1743: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1744: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1745: {
1746: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1747: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1748: const PetscScalar *x, *xb;
1749: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1750: const MatScalar *v;
1751: const PetscInt *ii, *ij = a->j, *idx;
1752: PetscInt mbs, i, j, n, *ridx = NULL;
1753: PetscBool usecprow = a->compressedrow.use;
1755: PetscFunctionBegin;
1756: PetscCall(VecGetArrayRead(xx, &x));
1757: PetscCall(VecGetArrayWrite(zz, &zarray));
1759: v = a->a;
1760: if (usecprow) {
1761: mbs = a->compressedrow.nrows;
1762: ii = a->compressedrow.i;
1763: ridx = a->compressedrow.rindex;
1764: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1765: } else {
1766: mbs = a->mbs;
1767: ii = a->i;
1768: z = zarray;
1769: }
1771: for (i = 0; i < mbs; i++) {
1772: n = ii[i + 1] - ii[i];
1773: idx = ij + ii[i];
1774: sum1 = 0.0;
1775: sum2 = 0.0;
1776: sum3 = 0.0;
1777: sum4 = 0.0;
1778: sum5 = 0.0;
1779: sum6 = 0.0;
1780: sum7 = 0.0;
1781: sum8 = 0.0;
1782: sum9 = 0.0;
1783: sum10 = 0.0;
1784: sum11 = 0.0;
1785: sum12 = 0.0;
1786: sum13 = 0.0;
1787: sum14 = 0.0;
1788: sum15 = 0.0;
1790: for (j = 0; j < n; j++) {
1791: xb = x + 15 * (idx[j]);
1792: x1 = xb[0];
1793: x2 = xb[1];
1794: x3 = xb[2];
1795: x4 = xb[3];
1796: x5 = xb[4];
1797: x6 = xb[5];
1798: x7 = xb[6];
1799: x8 = xb[7];
1801: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1802: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1803: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1804: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1805: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1806: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1807: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1808: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1809: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1810: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1811: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1812: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1813: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1814: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1815: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1816: v += 120;
1818: x1 = xb[8];
1819: x2 = xb[9];
1820: x3 = xb[10];
1821: x4 = xb[11];
1822: x5 = xb[12];
1823: x6 = xb[13];
1824: x7 = xb[14];
1826: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1827: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1828: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1829: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1830: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1831: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1832: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1833: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1834: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1835: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1836: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1837: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1838: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1839: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1840: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1841: v += 105;
1842: }
1843: if (usecprow) z = zarray + 15 * ridx[i];
1844: z[0] = sum1;
1845: z[1] = sum2;
1846: z[2] = sum3;
1847: z[3] = sum4;
1848: z[4] = sum5;
1849: z[5] = sum6;
1850: z[6] = sum7;
1851: z[7] = sum8;
1852: z[8] = sum9;
1853: z[9] = sum10;
1854: z[10] = sum11;
1855: z[11] = sum12;
1856: z[12] = sum13;
1857: z[13] = sum14;
1858: z[14] = sum15;
1860: if (!usecprow) z += 15;
1861: }
1863: PetscCall(VecRestoreArrayRead(xx, &x));
1864: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1865: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1866: PetscFunctionReturn(PETSC_SUCCESS);
1867: }
1869: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1870: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1871: {
1872: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1873: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1874: const PetscScalar *x, *xb;
1875: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1876: const MatScalar *v;
1877: const PetscInt *ii, *ij = a->j, *idx;
1878: PetscInt mbs, i, j, n, *ridx = NULL;
1879: PetscBool usecprow = a->compressedrow.use;
1881: PetscFunctionBegin;
1882: PetscCall(VecGetArrayRead(xx, &x));
1883: PetscCall(VecGetArrayWrite(zz, &zarray));
1885: v = a->a;
1886: if (usecprow) {
1887: mbs = a->compressedrow.nrows;
1888: ii = a->compressedrow.i;
1889: ridx = a->compressedrow.rindex;
1890: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1891: } else {
1892: mbs = a->mbs;
1893: ii = a->i;
1894: z = zarray;
1895: }
1897: for (i = 0; i < mbs; i++) {
1898: n = ii[i + 1] - ii[i];
1899: idx = ij + ii[i];
1900: sum1 = 0.0;
1901: sum2 = 0.0;
1902: sum3 = 0.0;
1903: sum4 = 0.0;
1904: sum5 = 0.0;
1905: sum6 = 0.0;
1906: sum7 = 0.0;
1907: sum8 = 0.0;
1908: sum9 = 0.0;
1909: sum10 = 0.0;
1910: sum11 = 0.0;
1911: sum12 = 0.0;
1912: sum13 = 0.0;
1913: sum14 = 0.0;
1914: sum15 = 0.0;
1916: for (j = 0; j < n; j++) {
1917: xb = x + 15 * (idx[j]);
1918: x1 = xb[0];
1919: x2 = xb[1];
1920: x3 = xb[2];
1921: x4 = xb[3];
1922: x5 = xb[4];
1923: x6 = xb[5];
1924: x7 = xb[6];
1925: x8 = xb[7];
1926: x9 = xb[8];
1927: x10 = xb[9];
1928: x11 = xb[10];
1929: x12 = xb[11];
1930: x13 = xb[12];
1931: x14 = xb[13];
1932: x15 = xb[14];
1934: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1935: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1936: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1937: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1938: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1939: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1940: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1941: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1942: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1943: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1944: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1945: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1946: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1947: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1948: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1949: v += 225;
1950: }
1951: if (usecprow) z = zarray + 15 * ridx[i];
1952: z[0] = sum1;
1953: z[1] = sum2;
1954: z[2] = sum3;
1955: z[3] = sum4;
1956: z[4] = sum5;
1957: z[5] = sum6;
1958: z[6] = sum7;
1959: z[7] = sum8;
1960: z[8] = sum9;
1961: z[9] = sum10;
1962: z[10] = sum11;
1963: z[11] = sum12;
1964: z[12] = sum13;
1965: z[13] = sum14;
1966: z[14] = sum15;
1968: if (!usecprow) z += 15;
1969: }
1971: PetscCall(VecRestoreArrayRead(xx, &x));
1972: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1973: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1974: PetscFunctionReturn(PETSC_SUCCESS);
1975: }
1977: /*
1978: This will not work with MatScalar == float because it calls the BLAS
1979: */
1980: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1981: {
1982: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1983: PetscScalar *z = NULL, *work, *workt, *zarray;
1984: const PetscScalar *x, *xb;
1985: const MatScalar *v;
1986: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1987: const PetscInt *idx, *ii, *ridx = NULL;
1988: PetscInt ncols, k;
1989: PetscBool usecprow = a->compressedrow.use;
1991: PetscFunctionBegin;
1992: PetscCall(VecGetArrayRead(xx, &x));
1993: PetscCall(VecGetArrayWrite(zz, &zarray));
1995: idx = a->j;
1996: v = a->a;
1997: if (usecprow) {
1998: mbs = a->compressedrow.nrows;
1999: ii = a->compressedrow.i;
2000: ridx = a->compressedrow.rindex;
2001: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2002: } else {
2003: mbs = a->mbs;
2004: ii = a->i;
2005: z = zarray;
2006: }
2008: if (!a->mult_work) {
2009: k = PetscMax(A->rmap->n, A->cmap->n);
2010: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2011: }
2012: work = a->mult_work;
2013: for (i = 0; i < mbs; i++) {
2014: n = ii[1] - ii[0];
2015: ii++;
2016: ncols = n * bs;
2017: workt = work;
2018: for (j = 0; j < n; j++) {
2019: xb = x + bs * (*idx++);
2020: for (k = 0; k < bs; k++) workt[k] = xb[k];
2021: workt += bs;
2022: }
2023: if (usecprow) z = zarray + bs * ridx[i];
2024: PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2025: v += n * bs2;
2026: if (!usecprow) z += bs;
2027: }
2028: PetscCall(VecRestoreArrayRead(xx, &x));
2029: PetscCall(VecRestoreArrayWrite(zz, &zarray));
2030: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2031: PetscFunctionReturn(PETSC_SUCCESS);
2032: }
2034: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2035: {
2036: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2037: const PetscScalar *x;
2038: PetscScalar *y, *z, sum;
2039: const MatScalar *v;
2040: PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2041: const PetscInt *idx, *ii;
2042: PetscBool usecprow = a->compressedrow.use;
2044: PetscFunctionBegin;
2045: PetscCall(VecGetArrayRead(xx, &x));
2046: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2048: idx = a->j;
2049: v = a->a;
2050: if (usecprow) {
2051: if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2052: mbs = a->compressedrow.nrows;
2053: ii = a->compressedrow.i;
2054: ridx = a->compressedrow.rindex;
2055: } else {
2056: ii = a->i;
2057: }
2059: for (i = 0; i < mbs; i++) {
2060: n = ii[1] - ii[0];
2061: ii++;
2062: if (!usecprow) {
2063: sum = y[i];
2064: } else {
2065: sum = y[ridx[i]];
2066: }
2067: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2068: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2069: PetscSparseDensePlusDot(sum, x, v, idx, n);
2070: v += n;
2071: idx += n;
2072: if (usecprow) {
2073: z[ridx[i]] = sum;
2074: } else {
2075: z[i] = sum;
2076: }
2077: }
2078: PetscCall(VecRestoreArrayRead(xx, &x));
2079: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2080: PetscCall(PetscLogFlops(2.0 * a->nz));
2081: PetscFunctionReturn(PETSC_SUCCESS);
2082: }
2084: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2085: {
2086: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2087: PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2088: const PetscScalar *x, *xb;
2089: PetscScalar x1, x2, *yarray, *zarray;
2090: const MatScalar *v;
2091: PetscInt mbs = a->mbs, i, n, j;
2092: const PetscInt *idx, *ii, *ridx = NULL;
2093: PetscBool usecprow = a->compressedrow.use;
2095: PetscFunctionBegin;
2096: PetscCall(VecGetArrayRead(xx, &x));
2097: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2099: idx = a->j;
2100: v = a->a;
2101: if (usecprow) {
2102: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2103: mbs = a->compressedrow.nrows;
2104: ii = a->compressedrow.i;
2105: ridx = a->compressedrow.rindex;
2106: } else {
2107: ii = a->i;
2108: y = yarray;
2109: z = zarray;
2110: }
2112: for (i = 0; i < mbs; i++) {
2113: n = ii[1] - ii[0];
2114: ii++;
2115: if (usecprow) {
2116: z = zarray + 2 * ridx[i];
2117: y = yarray + 2 * ridx[i];
2118: }
2119: sum1 = y[0];
2120: sum2 = y[1];
2121: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2122: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123: for (j = 0; j < n; j++) {
2124: xb = x + 2 * (*idx++);
2125: x1 = xb[0];
2126: x2 = xb[1];
2128: sum1 += v[0] * x1 + v[2] * x2;
2129: sum2 += v[1] * x1 + v[3] * x2;
2130: v += 4;
2131: }
2132: z[0] = sum1;
2133: z[1] = sum2;
2134: if (!usecprow) {
2135: z += 2;
2136: y += 2;
2137: }
2138: }
2139: PetscCall(VecRestoreArrayRead(xx, &x));
2140: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2141: PetscCall(PetscLogFlops(4.0 * a->nz));
2142: PetscFunctionReturn(PETSC_SUCCESS);
2143: }
2145: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2146: {
2147: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2148: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2149: const PetscScalar *x, *xb;
2150: const MatScalar *v;
2151: PetscInt mbs = a->mbs, i, j, n;
2152: const PetscInt *idx, *ii, *ridx = NULL;
2153: PetscBool usecprow = a->compressedrow.use;
2155: PetscFunctionBegin;
2156: PetscCall(VecGetArrayRead(xx, &x));
2157: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2159: idx = a->j;
2160: v = a->a;
2161: if (usecprow) {
2162: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2163: mbs = a->compressedrow.nrows;
2164: ii = a->compressedrow.i;
2165: ridx = a->compressedrow.rindex;
2166: } else {
2167: ii = a->i;
2168: y = yarray;
2169: z = zarray;
2170: }
2172: for (i = 0; i < mbs; i++) {
2173: n = ii[1] - ii[0];
2174: ii++;
2175: if (usecprow) {
2176: z = zarray + 3 * ridx[i];
2177: y = yarray + 3 * ridx[i];
2178: }
2179: sum1 = y[0];
2180: sum2 = y[1];
2181: sum3 = y[2];
2182: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2183: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2184: for (j = 0; j < n; j++) {
2185: xb = x + 3 * (*idx++);
2186: x1 = xb[0];
2187: x2 = xb[1];
2188: x3 = xb[2];
2189: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2190: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2191: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2192: v += 9;
2193: }
2194: z[0] = sum1;
2195: z[1] = sum2;
2196: z[2] = sum3;
2197: if (!usecprow) {
2198: z += 3;
2199: y += 3;
2200: }
2201: }
2202: PetscCall(VecRestoreArrayRead(xx, &x));
2203: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2204: PetscCall(PetscLogFlops(18.0 * a->nz));
2205: PetscFunctionReturn(PETSC_SUCCESS);
2206: }
2208: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2209: {
2210: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2211: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2212: const PetscScalar *x, *xb;
2213: const MatScalar *v;
2214: PetscInt mbs = a->mbs, i, j, n;
2215: const PetscInt *idx, *ii, *ridx = NULL;
2216: PetscBool usecprow = a->compressedrow.use;
2218: PetscFunctionBegin;
2219: PetscCall(VecGetArrayRead(xx, &x));
2220: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2222: idx = a->j;
2223: v = a->a;
2224: if (usecprow) {
2225: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2226: mbs = a->compressedrow.nrows;
2227: ii = a->compressedrow.i;
2228: ridx = a->compressedrow.rindex;
2229: } else {
2230: ii = a->i;
2231: y = yarray;
2232: z = zarray;
2233: }
2235: for (i = 0; i < mbs; i++) {
2236: n = ii[1] - ii[0];
2237: ii++;
2238: if (usecprow) {
2239: z = zarray + 4 * ridx[i];
2240: y = yarray + 4 * ridx[i];
2241: }
2242: sum1 = y[0];
2243: sum2 = y[1];
2244: sum3 = y[2];
2245: sum4 = y[3];
2246: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2247: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2248: for (j = 0; j < n; j++) {
2249: xb = x + 4 * (*idx++);
2250: x1 = xb[0];
2251: x2 = xb[1];
2252: x3 = xb[2];
2253: x4 = xb[3];
2254: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2255: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2256: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2257: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2258: v += 16;
2259: }
2260: z[0] = sum1;
2261: z[1] = sum2;
2262: z[2] = sum3;
2263: z[3] = sum4;
2264: if (!usecprow) {
2265: z += 4;
2266: y += 4;
2267: }
2268: }
2269: PetscCall(VecRestoreArrayRead(xx, &x));
2270: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2271: PetscCall(PetscLogFlops(32.0 * a->nz));
2272: PetscFunctionReturn(PETSC_SUCCESS);
2273: }
2275: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2276: {
2277: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2278: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2279: const PetscScalar *x, *xb;
2280: PetscScalar *yarray, *zarray;
2281: const MatScalar *v;
2282: PetscInt mbs = a->mbs, i, j, n;
2283: const PetscInt *idx, *ii, *ridx = NULL;
2284: PetscBool usecprow = a->compressedrow.use;
2286: PetscFunctionBegin;
2287: PetscCall(VecGetArrayRead(xx, &x));
2288: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2290: idx = a->j;
2291: v = a->a;
2292: if (usecprow) {
2293: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2294: mbs = a->compressedrow.nrows;
2295: ii = a->compressedrow.i;
2296: ridx = a->compressedrow.rindex;
2297: } else {
2298: ii = a->i;
2299: y = yarray;
2300: z = zarray;
2301: }
2303: for (i = 0; i < mbs; i++) {
2304: n = ii[1] - ii[0];
2305: ii++;
2306: if (usecprow) {
2307: z = zarray + 5 * ridx[i];
2308: y = yarray + 5 * ridx[i];
2309: }
2310: sum1 = y[0];
2311: sum2 = y[1];
2312: sum3 = y[2];
2313: sum4 = y[3];
2314: sum5 = y[4];
2315: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2316: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2317: for (j = 0; j < n; j++) {
2318: xb = x + 5 * (*idx++);
2319: x1 = xb[0];
2320: x2 = xb[1];
2321: x3 = xb[2];
2322: x4 = xb[3];
2323: x5 = xb[4];
2324: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2325: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2326: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2327: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2328: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2329: v += 25;
2330: }
2331: z[0] = sum1;
2332: z[1] = sum2;
2333: z[2] = sum3;
2334: z[3] = sum4;
2335: z[4] = sum5;
2336: if (!usecprow) {
2337: z += 5;
2338: y += 5;
2339: }
2340: }
2341: PetscCall(VecRestoreArrayRead(xx, &x));
2342: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2343: PetscCall(PetscLogFlops(50.0 * a->nz));
2344: PetscFunctionReturn(PETSC_SUCCESS);
2345: }
2347: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2348: {
2349: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2350: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2351: const PetscScalar *x, *xb;
2352: PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2353: const MatScalar *v;
2354: PetscInt mbs = a->mbs, i, j, n;
2355: const PetscInt *idx, *ii, *ridx = NULL;
2356: PetscBool usecprow = a->compressedrow.use;
2358: PetscFunctionBegin;
2359: PetscCall(VecGetArrayRead(xx, &x));
2360: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2362: idx = a->j;
2363: v = a->a;
2364: if (usecprow) {
2365: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2366: mbs = a->compressedrow.nrows;
2367: ii = a->compressedrow.i;
2368: ridx = a->compressedrow.rindex;
2369: } else {
2370: ii = a->i;
2371: y = yarray;
2372: z = zarray;
2373: }
2375: for (i = 0; i < mbs; i++) {
2376: n = ii[1] - ii[0];
2377: ii++;
2378: if (usecprow) {
2379: z = zarray + 6 * ridx[i];
2380: y = yarray + 6 * ridx[i];
2381: }
2382: sum1 = y[0];
2383: sum2 = y[1];
2384: sum3 = y[2];
2385: sum4 = y[3];
2386: sum5 = y[4];
2387: sum6 = y[5];
2388: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2389: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390: for (j = 0; j < n; j++) {
2391: xb = x + 6 * (*idx++);
2392: x1 = xb[0];
2393: x2 = xb[1];
2394: x3 = xb[2];
2395: x4 = xb[3];
2396: x5 = xb[4];
2397: x6 = xb[5];
2398: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404: v += 36;
2405: }
2406: z[0] = sum1;
2407: z[1] = sum2;
2408: z[2] = sum3;
2409: z[3] = sum4;
2410: z[4] = sum5;
2411: z[5] = sum6;
2412: if (!usecprow) {
2413: z += 6;
2414: y += 6;
2415: }
2416: }
2417: PetscCall(VecRestoreArrayRead(xx, &x));
2418: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2419: PetscCall(PetscLogFlops(72.0 * a->nz));
2420: PetscFunctionReturn(PETSC_SUCCESS);
2421: }
2423: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424: {
2425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2426: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427: const PetscScalar *x, *xb;
2428: PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429: const MatScalar *v;
2430: PetscInt mbs = a->mbs, i, j, n;
2431: const PetscInt *idx, *ii, *ridx = NULL;
2432: PetscBool usecprow = a->compressedrow.use;
2434: PetscFunctionBegin;
2435: PetscCall(VecGetArrayRead(xx, &x));
2436: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2438: idx = a->j;
2439: v = a->a;
2440: if (usecprow) {
2441: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2442: mbs = a->compressedrow.nrows;
2443: ii = a->compressedrow.i;
2444: ridx = a->compressedrow.rindex;
2445: } else {
2446: ii = a->i;
2447: y = yarray;
2448: z = zarray;
2449: }
2451: for (i = 0; i < mbs; i++) {
2452: n = ii[1] - ii[0];
2453: ii++;
2454: if (usecprow) {
2455: z = zarray + 7 * ridx[i];
2456: y = yarray + 7 * ridx[i];
2457: }
2458: sum1 = y[0];
2459: sum2 = y[1];
2460: sum3 = y[2];
2461: sum4 = y[3];
2462: sum5 = y[4];
2463: sum6 = y[5];
2464: sum7 = y[6];
2465: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2466: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2467: for (j = 0; j < n; j++) {
2468: xb = x + 7 * (*idx++);
2469: x1 = xb[0];
2470: x2 = xb[1];
2471: x3 = xb[2];
2472: x4 = xb[3];
2473: x5 = xb[4];
2474: x6 = xb[5];
2475: x7 = xb[6];
2476: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2477: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2478: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2479: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2480: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2481: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2482: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2483: v += 49;
2484: }
2485: z[0] = sum1;
2486: z[1] = sum2;
2487: z[2] = sum3;
2488: z[3] = sum4;
2489: z[4] = sum5;
2490: z[5] = sum6;
2491: z[6] = sum7;
2492: if (!usecprow) {
2493: z += 7;
2494: y += 7;
2495: }
2496: }
2497: PetscCall(VecRestoreArrayRead(xx, &x));
2498: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2499: PetscCall(PetscLogFlops(98.0 * a->nz));
2500: PetscFunctionReturn(PETSC_SUCCESS);
2501: }
2503: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2504: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2505: {
2506: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2507: PetscScalar *z = NULL, *work, *workt, *zarray;
2508: const PetscScalar *x, *xb;
2509: const MatScalar *v;
2510: PetscInt mbs, i, j, n;
2511: PetscInt k;
2512: PetscBool usecprow = a->compressedrow.use;
2513: const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2515: __m256d a0, a1, a2, a3, a4, a5;
2516: __m256d w0, w1, w2, w3;
2517: __m256d z0, z1, z2;
2518: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2520: PetscFunctionBegin;
2521: PetscCall(VecCopy(yy, zz));
2522: PetscCall(VecGetArrayRead(xx, &x));
2523: PetscCall(VecGetArray(zz, &zarray));
2525: idx = a->j;
2526: v = a->a;
2527: if (usecprow) {
2528: mbs = a->compressedrow.nrows;
2529: ii = a->compressedrow.i;
2530: ridx = a->compressedrow.rindex;
2531: } else {
2532: mbs = a->mbs;
2533: ii = a->i;
2534: z = zarray;
2535: }
2537: if (!a->mult_work) {
2538: k = PetscMax(A->rmap->n, A->cmap->n);
2539: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2540: }
2542: work = a->mult_work;
2543: for (i = 0; i < mbs; i++) {
2544: n = ii[1] - ii[0];
2545: ii++;
2546: workt = work;
2547: for (j = 0; j < n; j++) {
2548: xb = x + bs * (*idx++);
2549: for (k = 0; k < bs; k++) workt[k] = xb[k];
2550: workt += bs;
2551: }
2552: if (usecprow) z = zarray + bs * ridx[i];
2554: z0 = _mm256_loadu_pd(&z[0]);
2555: z1 = _mm256_loadu_pd(&z[4]);
2556: z2 = _mm256_set1_pd(z[8]);
2558: for (j = 0; j < n; j++) {
2559: /* first column of a */
2560: w0 = _mm256_set1_pd(work[j * 9]);
2561: a0 = _mm256_loadu_pd(&v[j * 81]);
2562: z0 = _mm256_fmadd_pd(a0, w0, z0);
2563: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2564: z1 = _mm256_fmadd_pd(a1, w0, z1);
2565: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2566: z2 = _mm256_fmadd_pd(a2, w0, z2);
2568: /* second column of a */
2569: w1 = _mm256_set1_pd(work[j * 9 + 1]);
2570: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2571: z0 = _mm256_fmadd_pd(a0, w1, z0);
2572: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2573: z1 = _mm256_fmadd_pd(a1, w1, z1);
2574: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2575: z2 = _mm256_fmadd_pd(a2, w1, z2);
2577: /* third column of a */
2578: w2 = _mm256_set1_pd(work[j * 9 + 2]);
2579: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2580: z0 = _mm256_fmadd_pd(a3, w2, z0);
2581: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2582: z1 = _mm256_fmadd_pd(a4, w2, z1);
2583: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2584: z2 = _mm256_fmadd_pd(a5, w2, z2);
2586: /* fourth column of a */
2587: w3 = _mm256_set1_pd(work[j * 9 + 3]);
2588: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2589: z0 = _mm256_fmadd_pd(a0, w3, z0);
2590: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2591: z1 = _mm256_fmadd_pd(a1, w3, z1);
2592: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2593: z2 = _mm256_fmadd_pd(a2, w3, z2);
2595: /* fifth column of a */
2596: w0 = _mm256_set1_pd(work[j * 9 + 4]);
2597: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2598: z0 = _mm256_fmadd_pd(a3, w0, z0);
2599: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2600: z1 = _mm256_fmadd_pd(a4, w0, z1);
2601: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2602: z2 = _mm256_fmadd_pd(a5, w0, z2);
2604: /* sixth column of a */
2605: w1 = _mm256_set1_pd(work[j * 9 + 5]);
2606: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2607: z0 = _mm256_fmadd_pd(a0, w1, z0);
2608: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2609: z1 = _mm256_fmadd_pd(a1, w1, z1);
2610: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2611: z2 = _mm256_fmadd_pd(a2, w1, z2);
2613: /* seventh column of a */
2614: w2 = _mm256_set1_pd(work[j * 9 + 6]);
2615: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2616: z0 = _mm256_fmadd_pd(a0, w2, z0);
2617: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2618: z1 = _mm256_fmadd_pd(a1, w2, z1);
2619: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2620: z2 = _mm256_fmadd_pd(a2, w2, z2);
2622: /* eighth column of a */
2623: w3 = _mm256_set1_pd(work[j * 9 + 7]);
2624: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2625: z0 = _mm256_fmadd_pd(a3, w3, z0);
2626: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2627: z1 = _mm256_fmadd_pd(a4, w3, z1);
2628: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2629: z2 = _mm256_fmadd_pd(a5, w3, z2);
2631: /* ninth column of a */
2632: w0 = _mm256_set1_pd(work[j * 9 + 8]);
2633: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2634: z0 = _mm256_fmadd_pd(a0, w0, z0);
2635: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2636: z1 = _mm256_fmadd_pd(a1, w0, z1);
2637: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2638: z2 = _mm256_fmadd_pd(a2, w0, z2);
2639: }
2641: _mm256_storeu_pd(&z[0], z0);
2642: _mm256_storeu_pd(&z[4], z1);
2643: _mm256_maskstore_pd(&z[8], mask1, z2);
2645: v += n * bs2;
2646: if (!usecprow) z += bs;
2647: }
2648: PetscCall(VecRestoreArrayRead(xx, &x));
2649: PetscCall(VecRestoreArray(zz, &zarray));
2650: PetscCall(PetscLogFlops(162.0 * a->nz));
2651: PetscFunctionReturn(PETSC_SUCCESS);
2652: }
2653: #endif
2655: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2656: {
2657: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2658: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2659: const PetscScalar *x, *xb;
2660: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2661: const MatScalar *v;
2662: PetscInt mbs = a->mbs, i, j, n;
2663: const PetscInt *idx, *ii, *ridx = NULL;
2664: PetscBool usecprow = a->compressedrow.use;
2666: PetscFunctionBegin;
2667: PetscCall(VecGetArrayRead(xx, &x));
2668: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2670: idx = a->j;
2671: v = a->a;
2672: if (usecprow) {
2673: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2674: mbs = a->compressedrow.nrows;
2675: ii = a->compressedrow.i;
2676: ridx = a->compressedrow.rindex;
2677: } else {
2678: ii = a->i;
2679: y = yarray;
2680: z = zarray;
2681: }
2683: for (i = 0; i < mbs; i++) {
2684: n = ii[1] - ii[0];
2685: ii++;
2686: if (usecprow) {
2687: z = zarray + 11 * ridx[i];
2688: y = yarray + 11 * ridx[i];
2689: }
2690: sum1 = y[0];
2691: sum2 = y[1];
2692: sum3 = y[2];
2693: sum4 = y[3];
2694: sum5 = y[4];
2695: sum6 = y[5];
2696: sum7 = y[6];
2697: sum8 = y[7];
2698: sum9 = y[8];
2699: sum10 = y[9];
2700: sum11 = y[10];
2701: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2702: PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2703: for (j = 0; j < n; j++) {
2704: xb = x + 11 * (*idx++);
2705: x1 = xb[0];
2706: x2 = xb[1];
2707: x3 = xb[2];
2708: x4 = xb[3];
2709: x5 = xb[4];
2710: x6 = xb[5];
2711: x7 = xb[6];
2712: x8 = xb[7];
2713: x9 = xb[8];
2714: x10 = xb[9];
2715: x11 = xb[10];
2716: sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2717: sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2718: sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2719: sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2720: sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2721: sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2722: sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2723: sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2724: sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2725: sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2726: sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2727: v += 121;
2728: }
2729: z[0] = sum1;
2730: z[1] = sum2;
2731: z[2] = sum3;
2732: z[3] = sum4;
2733: z[4] = sum5;
2734: z[5] = sum6;
2735: z[6] = sum7;
2736: z[7] = sum8;
2737: z[8] = sum9;
2738: z[9] = sum10;
2739: z[10] = sum11;
2740: if (!usecprow) {
2741: z += 11;
2742: y += 11;
2743: }
2744: }
2745: PetscCall(VecRestoreArrayRead(xx, &x));
2746: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2747: PetscCall(PetscLogFlops(242.0 * a->nz));
2748: PetscFunctionReturn(PETSC_SUCCESS);
2749: }
2751: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2752: {
2753: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2754: PetscScalar *z = NULL, *work, *workt, *zarray;
2755: const PetscScalar *x, *xb;
2756: const MatScalar *v;
2757: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2758: PetscInt ncols, k;
2759: const PetscInt *ridx = NULL, *idx, *ii;
2760: PetscBool usecprow = a->compressedrow.use;
2762: PetscFunctionBegin;
2763: PetscCall(VecCopy(yy, zz));
2764: PetscCall(VecGetArrayRead(xx, &x));
2765: PetscCall(VecGetArray(zz, &zarray));
2767: idx = a->j;
2768: v = a->a;
2769: if (usecprow) {
2770: mbs = a->compressedrow.nrows;
2771: ii = a->compressedrow.i;
2772: ridx = a->compressedrow.rindex;
2773: } else {
2774: mbs = a->mbs;
2775: ii = a->i;
2776: z = zarray;
2777: }
2779: if (!a->mult_work) {
2780: k = PetscMax(A->rmap->n, A->cmap->n);
2781: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2782: }
2783: work = a->mult_work;
2784: for (i = 0; i < mbs; i++) {
2785: n = ii[1] - ii[0];
2786: ii++;
2787: ncols = n * bs;
2788: workt = work;
2789: for (j = 0; j < n; j++) {
2790: xb = x + bs * (*idx++);
2791: for (k = 0; k < bs; k++) workt[k] = xb[k];
2792: workt += bs;
2793: }
2794: if (usecprow) z = zarray + bs * ridx[i];
2795: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2796: v += n * bs2;
2797: if (!usecprow) z += bs;
2798: }
2799: PetscCall(VecRestoreArrayRead(xx, &x));
2800: PetscCall(VecRestoreArray(zz, &zarray));
2801: PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2802: PetscFunctionReturn(PETSC_SUCCESS);
2803: }
2805: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2806: {
2807: PetscScalar zero = 0.0;
2809: PetscFunctionBegin;
2810: PetscCall(VecSet(zz, zero));
2811: PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812: PetscFunctionReturn(PETSC_SUCCESS);
2813: }
2815: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2816: {
2817: PetscScalar zero = 0.0;
2819: PetscFunctionBegin;
2820: PetscCall(VecSet(zz, zero));
2821: PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2822: PetscFunctionReturn(PETSC_SUCCESS);
2823: }
2825: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826: {
2827: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2828: PetscScalar *z, x1, x2, x3, x4, x5;
2829: const PetscScalar *x, *xb = NULL;
2830: const MatScalar *v;
2831: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2832: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2833: Mat_CompressedRow cprow = a->compressedrow;
2834: PetscBool usecprow = cprow.use;
2836: PetscFunctionBegin;
2837: if (yy != zz) PetscCall(VecCopy(yy, zz));
2838: PetscCall(VecGetArrayRead(xx, &x));
2839: PetscCall(VecGetArray(zz, &z));
2841: idx = a->j;
2842: v = a->a;
2843: if (usecprow) {
2844: mbs = cprow.nrows;
2845: ii = cprow.i;
2846: ridx = cprow.rindex;
2847: } else {
2848: mbs = a->mbs;
2849: ii = a->i;
2850: xb = x;
2851: }
2853: switch (bs) {
2854: case 1:
2855: for (i = 0; i < mbs; i++) {
2856: if (usecprow) xb = x + ridx[i];
2857: x1 = xb[0];
2858: ib = idx + ii[0];
2859: n = ii[1] - ii[0];
2860: ii++;
2861: for (j = 0; j < n; j++) {
2862: rval = ib[j];
2863: z[rval] += PetscConj(*v) * x1;
2864: v++;
2865: }
2866: if (!usecprow) xb++;
2867: }
2868: break;
2869: case 2:
2870: for (i = 0; i < mbs; i++) {
2871: if (usecprow) xb = x + 2 * ridx[i];
2872: x1 = xb[0];
2873: x2 = xb[1];
2874: ib = idx + ii[0];
2875: n = ii[1] - ii[0];
2876: ii++;
2877: for (j = 0; j < n; j++) {
2878: rval = ib[j] * 2;
2879: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2880: z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2881: v += 4;
2882: }
2883: if (!usecprow) xb += 2;
2884: }
2885: break;
2886: case 3:
2887: for (i = 0; i < mbs; i++) {
2888: if (usecprow) xb = x + 3 * ridx[i];
2889: x1 = xb[0];
2890: x2 = xb[1];
2891: x3 = xb[2];
2892: ib = idx + ii[0];
2893: n = ii[1] - ii[0];
2894: ii++;
2895: for (j = 0; j < n; j++) {
2896: rval = ib[j] * 3;
2897: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2898: z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2899: z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2900: v += 9;
2901: }
2902: if (!usecprow) xb += 3;
2903: }
2904: break;
2905: case 4:
2906: for (i = 0; i < mbs; i++) {
2907: if (usecprow) xb = x + 4 * ridx[i];
2908: x1 = xb[0];
2909: x2 = xb[1];
2910: x3 = xb[2];
2911: x4 = xb[3];
2912: ib = idx + ii[0];
2913: n = ii[1] - ii[0];
2914: ii++;
2915: for (j = 0; j < n; j++) {
2916: rval = ib[j] * 4;
2917: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2918: z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2919: z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2920: z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2921: v += 16;
2922: }
2923: if (!usecprow) xb += 4;
2924: }
2925: break;
2926: case 5:
2927: for (i = 0; i < mbs; i++) {
2928: if (usecprow) xb = x + 5 * ridx[i];
2929: x1 = xb[0];
2930: x2 = xb[1];
2931: x3 = xb[2];
2932: x4 = xb[3];
2933: x5 = xb[4];
2934: ib = idx + ii[0];
2935: n = ii[1] - ii[0];
2936: ii++;
2937: for (j = 0; j < n; j++) {
2938: rval = ib[j] * 5;
2939: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2940: z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2941: z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2942: z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2943: z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2944: v += 25;
2945: }
2946: if (!usecprow) xb += 5;
2947: }
2948: break;
2949: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2950: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2951: #if 0
2952: {
2953: PetscInt ncols,k,bs2=a->bs2;
2954: PetscScalar *work,*workt,zb;
2955: const PetscScalar *xtmp;
2956: if (!a->mult_work) {
2957: k = PetscMax(A->rmap->n,A->cmap->n);
2958: PetscCall(PetscMalloc1(k+1,&a->mult_work));
2959: }
2960: work = a->mult_work;
2961: xtmp = x;
2962: for (i=0; i<mbs; i++) {
2963: n = ii[1] - ii[0]; ii++;
2964: ncols = n*bs;
2965: PetscCall(PetscArrayzero(work,ncols));
2966: if (usecprow) xtmp = x + bs*ridx[i];
2967: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2968: v += n*bs2;
2969: if (!usecprow) xtmp += bs;
2970: workt = work;
2971: for (j=0; j<n; j++) {
2972: zb = z + bs*(*idx++);
2973: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2974: workt += bs;
2975: }
2976: }
2977: }
2978: #endif
2979: }
2980: PetscCall(VecRestoreArrayRead(xx, &x));
2981: PetscCall(VecRestoreArray(zz, &z));
2982: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2983: PetscFunctionReturn(PETSC_SUCCESS);
2984: }
2986: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2987: {
2988: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2989: PetscScalar *zb, *z, x1, x2, x3, x4, x5;
2990: const PetscScalar *x, *xb = NULL;
2991: const MatScalar *v;
2992: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2993: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2994: Mat_CompressedRow cprow = a->compressedrow;
2995: PetscBool usecprow = cprow.use;
2997: PetscFunctionBegin;
2998: if (yy != zz) PetscCall(VecCopy(yy, zz));
2999: PetscCall(VecGetArrayRead(xx, &x));
3000: PetscCall(VecGetArray(zz, &z));
3002: idx = a->j;
3003: v = a->a;
3004: if (usecprow) {
3005: mbs = cprow.nrows;
3006: ii = cprow.i;
3007: ridx = cprow.rindex;
3008: } else {
3009: mbs = a->mbs;
3010: ii = a->i;
3011: xb = x;
3012: }
3014: switch (bs) {
3015: case 1:
3016: for (i = 0; i < mbs; i++) {
3017: if (usecprow) xb = x + ridx[i];
3018: x1 = xb[0];
3019: ib = idx + ii[0];
3020: n = ii[1] - ii[0];
3021: ii++;
3022: for (j = 0; j < n; j++) {
3023: rval = ib[j];
3024: z[rval] += *v * x1;
3025: v++;
3026: }
3027: if (!usecprow) xb++;
3028: }
3029: break;
3030: case 2:
3031: for (i = 0; i < mbs; i++) {
3032: if (usecprow) xb = x + 2 * ridx[i];
3033: x1 = xb[0];
3034: x2 = xb[1];
3035: ib = idx + ii[0];
3036: n = ii[1] - ii[0];
3037: ii++;
3038: for (j = 0; j < n; j++) {
3039: rval = ib[j] * 2;
3040: z[rval++] += v[0] * x1 + v[1] * x2;
3041: z[rval++] += v[2] * x1 + v[3] * x2;
3042: v += 4;
3043: }
3044: if (!usecprow) xb += 2;
3045: }
3046: break;
3047: case 3:
3048: for (i = 0; i < mbs; i++) {
3049: if (usecprow) xb = x + 3 * ridx[i];
3050: x1 = xb[0];
3051: x2 = xb[1];
3052: x3 = xb[2];
3053: ib = idx + ii[0];
3054: n = ii[1] - ii[0];
3055: ii++;
3056: for (j = 0; j < n; j++) {
3057: rval = ib[j] * 3;
3058: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3059: z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3060: z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3061: v += 9;
3062: }
3063: if (!usecprow) xb += 3;
3064: }
3065: break;
3066: case 4:
3067: for (i = 0; i < mbs; i++) {
3068: if (usecprow) xb = x + 4 * ridx[i];
3069: x1 = xb[0];
3070: x2 = xb[1];
3071: x3 = xb[2];
3072: x4 = xb[3];
3073: ib = idx + ii[0];
3074: n = ii[1] - ii[0];
3075: ii++;
3076: for (j = 0; j < n; j++) {
3077: rval = ib[j] * 4;
3078: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3079: z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3080: z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3081: z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3082: v += 16;
3083: }
3084: if (!usecprow) xb += 4;
3085: }
3086: break;
3087: case 5:
3088: for (i = 0; i < mbs; i++) {
3089: if (usecprow) xb = x + 5 * ridx[i];
3090: x1 = xb[0];
3091: x2 = xb[1];
3092: x3 = xb[2];
3093: x4 = xb[3];
3094: x5 = xb[4];
3095: ib = idx + ii[0];
3096: n = ii[1] - ii[0];
3097: ii++;
3098: for (j = 0; j < n; j++) {
3099: rval = ib[j] * 5;
3100: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3101: z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3102: z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3103: z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3104: z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3105: v += 25;
3106: }
3107: if (!usecprow) xb += 5;
3108: }
3109: break;
3110: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3111: PetscInt ncols, k;
3112: PetscScalar *work, *workt;
3113: const PetscScalar *xtmp;
3114: if (!a->mult_work) {
3115: k = PetscMax(A->rmap->n, A->cmap->n);
3116: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3117: }
3118: work = a->mult_work;
3119: xtmp = x;
3120: for (i = 0; i < mbs; i++) {
3121: n = ii[1] - ii[0];
3122: ii++;
3123: ncols = n * bs;
3124: PetscCall(PetscArrayzero(work, ncols));
3125: if (usecprow) xtmp = x + bs * ridx[i];
3126: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3127: v += n * bs2;
3128: if (!usecprow) xtmp += bs;
3129: workt = work;
3130: for (j = 0; j < n; j++) {
3131: zb = z + bs * (*idx++);
3132: for (k = 0; k < bs; k++) zb[k] += workt[k];
3133: workt += bs;
3134: }
3135: }
3136: }
3137: }
3138: PetscCall(VecRestoreArrayRead(xx, &x));
3139: PetscCall(VecRestoreArray(zz, &z));
3140: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3141: PetscFunctionReturn(PETSC_SUCCESS);
3142: }
3144: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3145: {
3146: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3147: PetscInt totalnz = a->bs2 * a->nz;
3148: PetscScalar oalpha = alpha;
3149: PetscBLASInt one = 1, tnz;
3151: PetscFunctionBegin;
3152: PetscCall(PetscBLASIntCast(totalnz, &tnz));
3153: PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3154: PetscCall(PetscLogFlops(totalnz));
3155: PetscFunctionReturn(PETSC_SUCCESS);
3156: }
3158: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3159: {
3160: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3161: MatScalar *v = a->a;
3162: PetscReal sum = 0.0;
3163: PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3165: PetscFunctionBegin;
3166: if (type == NORM_FROBENIUS) {
3167: #if defined(PETSC_USE_REAL___FP16)
3168: PetscBLASInt one = 1, cnt = bs2 * nz;
3169: PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3170: #else
3171: for (i = 0; i < bs2 * nz; i++) {
3172: sum += PetscRealPart(PetscConj(*v) * (*v));
3173: v++;
3174: }
3175: #endif
3176: *norm = PetscSqrtReal(sum);
3177: PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3178: } else if (type == NORM_1) { /* maximum column sum */
3179: PetscReal *tmp;
3180: PetscInt *bcol = a->j;
3181: PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3182: for (i = 0; i < nz; i++) {
3183: for (j = 0; j < bs; j++) {
3184: k1 = bs * (*bcol) + j; /* column index */
3185: for (k = 0; k < bs; k++) {
3186: tmp[k1] += PetscAbsScalar(*v);
3187: v++;
3188: }
3189: }
3190: bcol++;
3191: }
3192: *norm = 0.0;
3193: for (j = 0; j < A->cmap->n; j++) {
3194: if (tmp[j] > *norm) *norm = tmp[j];
3195: }
3196: PetscCall(PetscFree(tmp));
3197: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3198: } else if (type == NORM_INFINITY) { /* maximum row sum */
3199: *norm = 0.0;
3200: for (k = 0; k < bs; k++) {
3201: for (j = 0; j < a->mbs; j++) {
3202: v = a->a + bs2 * a->i[j] + k;
3203: sum = 0.0;
3204: for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3205: for (k1 = 0; k1 < bs; k1++) {
3206: sum += PetscAbsScalar(*v);
3207: v += bs;
3208: }
3209: }
3210: if (sum > *norm) *norm = sum;
3211: }
3212: }
3213: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3214: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3215: PetscFunctionReturn(PETSC_SUCCESS);
3216: }
3218: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3219: {
3220: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3222: PetscFunctionBegin;
3223: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
3224: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3225: *flg = PETSC_FALSE;
3226: PetscFunctionReturn(PETSC_SUCCESS);
3227: }
3229: /* if the a->i are the same */
3230: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3231: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3233: /* if a->j are the same */
3234: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3235: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3237: /* if a->a are the same */
3238: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3239: PetscFunctionReturn(PETSC_SUCCESS);
3240: }
3242: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3243: {
3244: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3245: PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3246: PetscScalar *x, zero = 0.0;
3247: MatScalar *aa, *aa_j;
3249: PetscFunctionBegin;
3250: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3251: bs = A->rmap->bs;
3252: aa = a->a;
3253: ai = a->i;
3254: aj = a->j;
3255: ambs = a->mbs;
3256: bs2 = a->bs2;
3258: PetscCall(VecSet(v, zero));
3259: PetscCall(VecGetArray(v, &x));
3260: PetscCall(VecGetLocalSize(v, &n));
3261: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3262: for (i = 0; i < ambs; i++) {
3263: for (j = ai[i]; j < ai[i + 1]; j++) {
3264: if (aj[j] == i) {
3265: row = i * bs;
3266: aa_j = aa + j * bs2;
3267: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3268: break;
3269: }
3270: }
3271: }
3272: PetscCall(VecRestoreArray(v, &x));
3273: PetscFunctionReturn(PETSC_SUCCESS);
3274: }
3276: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3277: {
3278: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3279: const PetscScalar *l, *r, *li, *ri;
3280: PetscScalar x;
3281: MatScalar *aa, *v;
3282: PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3283: const PetscInt *ai, *aj;
3285: PetscFunctionBegin;
3286: ai = a->i;
3287: aj = a->j;
3288: aa = a->a;
3289: m = A->rmap->n;
3290: n = A->cmap->n;
3291: bs = A->rmap->bs;
3292: mbs = a->mbs;
3293: bs2 = a->bs2;
3294: if (ll) {
3295: PetscCall(VecGetArrayRead(ll, &l));
3296: PetscCall(VecGetLocalSize(ll, &lm));
3297: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3298: for (i = 0; i < mbs; i++) { /* for each block row */
3299: M = ai[i + 1] - ai[i];
3300: li = l + i * bs;
3301: v = PetscSafePointerPlusOffset(aa, bs2 * ai[i]);
3302: for (j = 0; j < M; j++) { /* for each block */
3303: for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3304: }
3305: }
3306: PetscCall(VecRestoreArrayRead(ll, &l));
3307: PetscCall(PetscLogFlops(a->nz));
3308: }
3310: if (rr) {
3311: PetscCall(VecGetArrayRead(rr, &r));
3312: PetscCall(VecGetLocalSize(rr, &rn));
3313: PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3314: for (i = 0; i < mbs; i++) { /* for each block row */
3315: iai = ai[i];
3316: M = ai[i + 1] - iai;
3317: v = PetscSafePointerPlusOffset(aa, bs2 * iai);
3318: for (j = 0; j < M; j++) { /* for each block */
3319: ri = r + bs * aj[iai + j];
3320: for (k = 0; k < bs; k++) {
3321: x = ri[k];
3322: for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3323: v += bs;
3324: }
3325: }
3326: }
3327: PetscCall(VecRestoreArrayRead(rr, &r));
3328: PetscCall(PetscLogFlops(a->nz));
3329: }
3330: PetscFunctionReturn(PETSC_SUCCESS);
3331: }
3333: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3334: {
3335: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3337: PetscFunctionBegin;
3338: info->block_size = a->bs2;
3339: info->nz_allocated = a->bs2 * a->maxnz;
3340: info->nz_used = a->bs2 * a->nz;
3341: info->nz_unneeded = info->nz_allocated - info->nz_used;
3342: info->assemblies = A->num_ass;
3343: info->mallocs = A->info.mallocs;
3344: info->memory = 0; /* REVIEW ME */
3345: if (A->factortype) {
3346: info->fill_ratio_given = A->info.fill_ratio_given;
3347: info->fill_ratio_needed = A->info.fill_ratio_needed;
3348: info->factor_mallocs = A->info.factor_mallocs;
3349: } else {
3350: info->fill_ratio_given = 0;
3351: info->fill_ratio_needed = 0;
3352: info->factor_mallocs = 0;
3353: }
3354: PetscFunctionReturn(PETSC_SUCCESS);
3355: }
3357: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3358: {
3359: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3361: PetscFunctionBegin;
3362: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3363: PetscFunctionReturn(PETSC_SUCCESS);
3364: }
3366: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3367: {
3368: PetscFunctionBegin;
3369: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3370: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3371: PetscFunctionReturn(PETSC_SUCCESS);
3372: }
3374: static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3375: {
3376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3377: PetscScalar *z = NULL, sum1;
3378: const PetscScalar *xb;
3379: PetscScalar x1;
3380: const MatScalar *v, *vv;
3381: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3382: PetscBool usecprow = a->compressedrow.use;
3384: PetscFunctionBegin;
3385: idx = a->j;
3386: v = a->a;
3387: if (usecprow) {
3388: mbs = a->compressedrow.nrows;
3389: ii = a->compressedrow.i;
3390: ridx = a->compressedrow.rindex;
3391: } else {
3392: mbs = a->mbs;
3393: ii = a->i;
3394: z = c;
3395: }
3397: for (i = 0; i < mbs; i++) {
3398: n = ii[1] - ii[0];
3399: ii++;
3400: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3401: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3402: if (usecprow) z = c + ridx[i];
3403: jj = idx;
3404: vv = v;
3405: for (k = 0; k < cn; k++) {
3406: idx = jj;
3407: v = vv;
3408: sum1 = 0.0;
3409: for (j = 0; j < n; j++) {
3410: xb = b + (*idx++);
3411: x1 = xb[0 + k * bm];
3412: sum1 += v[0] * x1;
3413: v += 1;
3414: }
3415: z[0 + k * cm] = sum1;
3416: }
3417: if (!usecprow) z += 1;
3418: }
3419: PetscFunctionReturn(PETSC_SUCCESS);
3420: }
3422: static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3423: {
3424: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3425: PetscScalar *z = NULL, sum1, sum2;
3426: const PetscScalar *xb;
3427: PetscScalar x1, x2;
3428: const MatScalar *v, *vv;
3429: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3430: PetscBool usecprow = a->compressedrow.use;
3432: PetscFunctionBegin;
3433: idx = a->j;
3434: v = a->a;
3435: if (usecprow) {
3436: mbs = a->compressedrow.nrows;
3437: ii = a->compressedrow.i;
3438: ridx = a->compressedrow.rindex;
3439: } else {
3440: mbs = a->mbs;
3441: ii = a->i;
3442: z = c;
3443: }
3445: for (i = 0; i < mbs; i++) {
3446: n = ii[1] - ii[0];
3447: ii++;
3448: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3449: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3450: if (usecprow) z = c + 2 * ridx[i];
3451: jj = idx;
3452: vv = v;
3453: for (k = 0; k < cn; k++) {
3454: idx = jj;
3455: v = vv;
3456: sum1 = 0.0;
3457: sum2 = 0.0;
3458: for (j = 0; j < n; j++) {
3459: xb = b + 2 * (*idx++);
3460: x1 = xb[0 + k * bm];
3461: x2 = xb[1 + k * bm];
3462: sum1 += v[0] * x1 + v[2] * x2;
3463: sum2 += v[1] * x1 + v[3] * x2;
3464: v += 4;
3465: }
3466: z[0 + k * cm] = sum1;
3467: z[1 + k * cm] = sum2;
3468: }
3469: if (!usecprow) z += 2;
3470: }
3471: PetscFunctionReturn(PETSC_SUCCESS);
3472: }
3474: static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3475: {
3476: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3477: PetscScalar *z = NULL, sum1, sum2, sum3;
3478: const PetscScalar *xb;
3479: PetscScalar x1, x2, x3;
3480: const MatScalar *v, *vv;
3481: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3482: PetscBool usecprow = a->compressedrow.use;
3484: PetscFunctionBegin;
3485: idx = a->j;
3486: v = a->a;
3487: if (usecprow) {
3488: mbs = a->compressedrow.nrows;
3489: ii = a->compressedrow.i;
3490: ridx = a->compressedrow.rindex;
3491: } else {
3492: mbs = a->mbs;
3493: ii = a->i;
3494: z = c;
3495: }
3497: for (i = 0; i < mbs; i++) {
3498: n = ii[1] - ii[0];
3499: ii++;
3500: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3501: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3502: if (usecprow) z = c + 3 * ridx[i];
3503: jj = idx;
3504: vv = v;
3505: for (k = 0; k < cn; k++) {
3506: idx = jj;
3507: v = vv;
3508: sum1 = 0.0;
3509: sum2 = 0.0;
3510: sum3 = 0.0;
3511: for (j = 0; j < n; j++) {
3512: xb = b + 3 * (*idx++);
3513: x1 = xb[0 + k * bm];
3514: x2 = xb[1 + k * bm];
3515: x3 = xb[2 + k * bm];
3516: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3517: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3518: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3519: v += 9;
3520: }
3521: z[0 + k * cm] = sum1;
3522: z[1 + k * cm] = sum2;
3523: z[2 + k * cm] = sum3;
3524: }
3525: if (!usecprow) z += 3;
3526: }
3527: PetscFunctionReturn(PETSC_SUCCESS);
3528: }
3530: static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3531: {
3532: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3533: PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3534: const PetscScalar *xb;
3535: PetscScalar x1, x2, x3, x4;
3536: const MatScalar *v, *vv;
3537: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3538: PetscBool usecprow = a->compressedrow.use;
3540: PetscFunctionBegin;
3541: idx = a->j;
3542: v = a->a;
3543: if (usecprow) {
3544: mbs = a->compressedrow.nrows;
3545: ii = a->compressedrow.i;
3546: ridx = a->compressedrow.rindex;
3547: } else {
3548: mbs = a->mbs;
3549: ii = a->i;
3550: z = c;
3551: }
3553: for (i = 0; i < mbs; i++) {
3554: n = ii[1] - ii[0];
3555: ii++;
3556: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3557: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3558: if (usecprow) z = c + 4 * ridx[i];
3559: jj = idx;
3560: vv = v;
3561: for (k = 0; k < cn; k++) {
3562: idx = jj;
3563: v = vv;
3564: sum1 = 0.0;
3565: sum2 = 0.0;
3566: sum3 = 0.0;
3567: sum4 = 0.0;
3568: for (j = 0; j < n; j++) {
3569: xb = b + 4 * (*idx++);
3570: x1 = xb[0 + k * bm];
3571: x2 = xb[1 + k * bm];
3572: x3 = xb[2 + k * bm];
3573: x4 = xb[3 + k * bm];
3574: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3575: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3576: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3577: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3578: v += 16;
3579: }
3580: z[0 + k * cm] = sum1;
3581: z[1 + k * cm] = sum2;
3582: z[2 + k * cm] = sum3;
3583: z[3 + k * cm] = sum4;
3584: }
3585: if (!usecprow) z += 4;
3586: }
3587: PetscFunctionReturn(PETSC_SUCCESS);
3588: }
3590: static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3591: {
3592: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3593: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3594: const PetscScalar *xb;
3595: PetscScalar x1, x2, x3, x4, x5;
3596: const MatScalar *v, *vv;
3597: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3598: PetscBool usecprow = a->compressedrow.use;
3600: PetscFunctionBegin;
3601: idx = a->j;
3602: v = a->a;
3603: if (usecprow) {
3604: mbs = a->compressedrow.nrows;
3605: ii = a->compressedrow.i;
3606: ridx = a->compressedrow.rindex;
3607: } else {
3608: mbs = a->mbs;
3609: ii = a->i;
3610: z = c;
3611: }
3613: for (i = 0; i < mbs; i++) {
3614: n = ii[1] - ii[0];
3615: ii++;
3616: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3617: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3618: if (usecprow) z = c + 5 * ridx[i];
3619: jj = idx;
3620: vv = v;
3621: for (k = 0; k < cn; k++) {
3622: idx = jj;
3623: v = vv;
3624: sum1 = 0.0;
3625: sum2 = 0.0;
3626: sum3 = 0.0;
3627: sum4 = 0.0;
3628: sum5 = 0.0;
3629: for (j = 0; j < n; j++) {
3630: xb = b + 5 * (*idx++);
3631: x1 = xb[0 + k * bm];
3632: x2 = xb[1 + k * bm];
3633: x3 = xb[2 + k * bm];
3634: x4 = xb[3 + k * bm];
3635: x5 = xb[4 + k * bm];
3636: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3637: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3638: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3639: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3640: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3641: v += 25;
3642: }
3643: z[0 + k * cm] = sum1;
3644: z[1 + k * cm] = sum2;
3645: z[2 + k * cm] = sum3;
3646: z[3 + k * cm] = sum4;
3647: z[4 + k * cm] = sum5;
3648: }
3649: if (!usecprow) z += 5;
3650: }
3651: PetscFunctionReturn(PETSC_SUCCESS);
3652: }
3654: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3655: {
3656: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3657: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3658: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3659: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3660: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3661: PetscBLASInt bbs, bcn, bbm, bcm;
3662: PetscScalar *z = NULL;
3663: PetscScalar *c, *b;
3664: const MatScalar *v;
3665: const PetscInt *idx, *ii, *ridx = NULL;
3666: PetscScalar _DZero = 0.0, _DOne = 1.0;
3667: PetscBool usecprow = a->compressedrow.use;
3669: PetscFunctionBegin;
3670: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3671: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3672: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3673: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3674: b = bd->v;
3675: if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3676: PetscCall(MatDenseGetArray(C, &c));
3677: switch (bs) {
3678: case 1:
3679: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3680: break;
3681: case 2:
3682: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3683: break;
3684: case 3:
3685: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3686: break;
3687: case 4:
3688: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3689: break;
3690: case 5:
3691: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3692: break;
3693: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3694: PetscCall(PetscBLASIntCast(bs, &bbs));
3695: PetscCall(PetscBLASIntCast(cn, &bcn));
3696: PetscCall(PetscBLASIntCast(bm, &bbm));
3697: PetscCall(PetscBLASIntCast(cm, &bcm));
3698: idx = a->j;
3699: v = a->a;
3700: if (usecprow) {
3701: mbs = a->compressedrow.nrows;
3702: ii = a->compressedrow.i;
3703: ridx = a->compressedrow.rindex;
3704: } else {
3705: mbs = a->mbs;
3706: ii = a->i;
3707: z = c;
3708: }
3709: for (i = 0; i < mbs; i++) {
3710: n = ii[1] - ii[0];
3711: ii++;
3712: if (usecprow) z = c + bs * ridx[i];
3713: if (n) {
3714: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3715: v += bs2;
3716: }
3717: for (j = 1; j < n; j++) {
3718: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3719: v += bs2;
3720: }
3721: if (!usecprow) z += bs;
3722: }
3723: }
3724: PetscCall(MatDenseRestoreArray(C, &c));
3725: PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3726: PetscFunctionReturn(PETSC_SUCCESS);
3727: }