Actual source code: sbaij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
11: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12: const PetscInt *idx;
13: PetscBT table_out, table_in;
15: PetscFunctionBegin;
16: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscCall(PetscBTCreate(mbs, &table_out));
22: PetscCall(PetscMalloc1(mbs + 1, &nidx));
23: PetscCall(PetscBTCreate(mbs, &table_in));
25: for (i = 0; i < is_max; i++) { /* for each is */
26: isz = 0;
27: PetscCall(PetscBTMemzero(mbs, table_out));
29: /* Extract the indices, assume there can be duplicate entries */
30: PetscCall(ISGetIndices(is[i], &idx));
31: PetscCall(ISGetLocalSize(is[i], &n));
33: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34: bcol_max = 0;
35: for (j = 0; j < n; ++j) {
36: brow = idx[j] / bs; /* convert the indices into block indices */
37: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38: if (!PetscBTLookupSet(table_out, brow)) {
39: nidx[isz++] = brow;
40: if (bcol_max < brow) bcol_max = brow;
41: }
42: }
43: PetscCall(ISRestoreIndices(is[i], &idx));
44: PetscCall(ISDestroy(&is[i]));
46: k = 0;
47: for (j = 0; j < ov; j++) { /* for each overlap */
48: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49: PetscCall(PetscBTMemzero(mbs, table_in));
50: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52: n = isz; /* length of the updated is[i] */
53: for (brow = 0; brow < mbs; brow++) {
54: start = ai[brow];
55: end = ai[brow + 1];
56: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l < end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out, bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l < end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in, bcol)) {
71: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
78: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79: } /* for each is */
80: PetscCall(PetscBTDestroy(&table_out));
81: PetscCall(PetscFree(nidx));
82: PetscCall(PetscBTDestroy(&table_in));
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87: Zero some ops' to avoid invalid use */
88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89: {
90: PetscFunctionBegin;
91: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92: Bseq->ops->mult = NULL;
93: Bseq->ops->multadd = NULL;
94: Bseq->ops->multtranspose = NULL;
95: Bseq->ops->multtransposeadd = NULL;
96: Bseq->ops->lufactor = NULL;
97: Bseq->ops->choleskyfactor = NULL;
98: Bseq->ops->lufactorsymbolic = NULL;
99: Bseq->ops->choleskyfactorsymbolic = NULL;
100: Bseq->ops->getinertia = NULL;
101: PetscFunctionReturn(PETSC_SUCCESS);
102: }
104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108: Mat_SeqBAIJ *d = NULL;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
119: PetscCall(ISGetIndices(isrow, &irow));
120: PetscCall(ISGetIndices(iscol, &icol));
121: PetscCall(ISGetLocalSize(isrow, &nrows));
122: PetscCall(ISGetLocalSize(iscol, &ncols));
124: PetscCall(PetscCalloc1(1 + oldcols, &smap));
125: ssmap = smap;
126: PetscCall(PetscMalloc1(1 + nrows, &lens));
127: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128: /* determine lens of each row */
129: for (i = 0; i < nrows; i++) {
130: kstart = ai[irow[i]];
131: kend = kstart + a->ilen[irow[i]];
132: lens[i] = 0;
133: for (k = kstart; k < kend; k++) {
134: if (ssmap[aj[k]]) lens[i]++;
135: }
136: }
137: /* Create and fill new matrix */
138: if (scall == MAT_REUSE_MATRIX) {
139: if (sym) {
140: c = (Mat_SeqSBAIJ *)((*B)->data);
142: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145: PetscCall(PetscArrayzero(c->ilen, c->mbs));
146: } else {
147: d = (Mat_SeqBAIJ *)((*B)->data);
149: PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150: PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152: PetscCall(PetscArrayzero(d->ilen, d->mbs));
153: }
154: C = *B;
155: } else {
156: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158: if (sym) {
159: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161: } else {
162: PetscCall(MatSetType(C, MATSEQBAIJ));
163: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164: }
165: }
166: if (sym) c = (Mat_SeqSBAIJ *)C->data;
167: else d = (Mat_SeqBAIJ *)C->data;
168: for (i = 0; i < nrows; i++) {
169: row = irow[i];
170: kstart = ai[row];
171: kend = kstart + a->ilen[row];
172: if (sym) {
173: mat_i = c->i[i];
174: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
175: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176: mat_ilen = c->ilen + i;
177: } else {
178: mat_i = d->i[i];
179: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
180: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181: mat_ilen = d->ilen + i;
182: }
183: for (k = kstart; k < kend; k++) {
184: if ((tcol = ssmap[a->j[k]])) {
185: *mat_j++ = tcol - 1;
186: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187: mat_a += bs2;
188: (*mat_ilen)++;
189: }
190: }
191: }
192: /* sort */
193: {
194: MatScalar *work;
196: PetscCall(PetscMalloc1(bs2, &work));
197: for (i = 0; i < nrows; i++) {
198: PetscInt ilen;
199: if (sym) {
200: mat_i = c->i[i];
201: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203: ilen = c->ilen[i];
204: } else {
205: mat_i = d->i[i];
206: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208: ilen = d->ilen[i];
209: }
210: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211: }
212: PetscCall(PetscFree(work));
213: }
215: /* Free work space */
216: PetscCall(ISRestoreIndices(iscol, &icol));
217: PetscCall(PetscFree(smap));
218: PetscCall(PetscFree(lens));
219: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
222: PetscCall(ISRestoreIndices(isrow, &irow));
223: *B = C;
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228: {
229: Mat C[2];
230: IS is1, is2, intersect = NULL;
231: PetscInt n1, n2, ni;
232: PetscBool sym = PETSC_TRUE;
234: PetscFunctionBegin;
235: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236: if (isrow == iscol) {
237: is2 = is1;
238: PetscCall(PetscObjectReference((PetscObject)is2));
239: } else {
240: PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241: PetscCall(ISIntersect(is1, is2, &intersect));
242: PetscCall(ISGetLocalSize(intersect, &ni));
243: PetscCall(ISDestroy(&intersect));
244: if (ni == 0) sym = PETSC_FALSE;
245: else if (PetscDefined(USE_DEBUG)) {
246: PetscCall(ISGetLocalSize(is1, &n1));
247: PetscCall(ISGetLocalSize(is2, &n2));
248: PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249: }
250: }
251: if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252: else {
253: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255: PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256: PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257: PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258: if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259: else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260: else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261: PetscCall(MatDestroy(C));
262: PetscCall(MatDestroy(C + 1));
263: }
264: PetscCall(ISDestroy(&is1));
265: PetscCall(ISDestroy(&is2));
267: if (sym && isrow != iscol) {
268: PetscBool isequal;
269: PetscCall(ISEqual(isrow, iscol, &isequal));
270: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271: }
272: PetscFunctionReturn(PETSC_SUCCESS);
273: }
275: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276: {
277: PetscInt i;
279: PetscFunctionBegin;
280: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
282: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283: PetscFunctionReturn(PETSC_SUCCESS);
284: }
286: /* Should check that shapes of vectors and matrices match */
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
290: PetscScalar *z, x1, x2, zero = 0.0;
291: const PetscScalar *x, *xb;
292: const MatScalar *v;
293: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
294: const PetscInt *aj = a->j, *ai = a->i, *ib;
295: PetscInt nonzerorow = 0;
297: PetscFunctionBegin;
298: PetscCall(VecSet(zz, zero));
299: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300: PetscCall(VecGetArrayRead(xx, &x));
301: PetscCall(VecGetArray(zz, &z));
303: v = a->a;
304: xb = x;
306: for (i = 0; i < mbs; i++) {
307: n = ai[1] - ai[0]; /* length of i_th block row of A */
308: x1 = xb[0];
309: x2 = xb[1];
310: ib = aj + *ai;
311: jmin = 0;
312: nonzerorow += (n > 0);
313: if (*ib == i) { /* (diag of A)*x */
314: z[2 * i] += v[0] * x1 + v[2] * x2;
315: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316: v += 4;
317: jmin++;
318: }
319: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321: for (j = jmin; j < n; j++) {
322: /* (strict lower triangular part of A)*x */
323: cval = ib[j] * 2;
324: z[cval] += v[0] * x1 + v[1] * x2;
325: z[cval + 1] += v[2] * x1 + v[3] * x2;
326: /* (strict upper triangular part of A)*x */
327: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329: v += 4;
330: }
331: xb += 2;
332: ai++;
333: }
335: PetscCall(VecRestoreArrayRead(xx, &x));
336: PetscCall(VecRestoreArray(zz, &z));
337: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338: PetscFunctionReturn(PETSC_SUCCESS);
339: }
341: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342: {
343: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
344: PetscScalar *z, x1, x2, x3, zero = 0.0;
345: const PetscScalar *x, *xb;
346: const MatScalar *v;
347: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
348: const PetscInt *aj = a->j, *ai = a->i, *ib;
349: PetscInt nonzerorow = 0;
351: PetscFunctionBegin;
352: PetscCall(VecSet(zz, zero));
353: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354: PetscCall(VecGetArrayRead(xx, &x));
355: PetscCall(VecGetArray(zz, &z));
357: v = a->a;
358: xb = x;
360: for (i = 0; i < mbs; i++) {
361: n = ai[1] - ai[0]; /* length of i_th block row of A */
362: x1 = xb[0];
363: x2 = xb[1];
364: x3 = xb[2];
365: ib = aj + *ai;
366: jmin = 0;
367: nonzerorow += (n > 0);
368: if (*ib == i) { /* (diag of A)*x */
369: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372: v += 9;
373: jmin++;
374: }
375: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377: for (j = jmin; j < n; j++) {
378: /* (strict lower triangular part of A)*x */
379: cval = ib[j] * 3;
380: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383: /* (strict upper triangular part of A)*x */
384: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387: v += 9;
388: }
389: xb += 3;
390: ai++;
391: }
393: PetscCall(VecRestoreArrayRead(xx, &x));
394: PetscCall(VecRestoreArray(zz, &z));
395: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400: {
401: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
402: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
403: const PetscScalar *x, *xb;
404: const MatScalar *v;
405: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
406: const PetscInt *aj = a->j, *ai = a->i, *ib;
407: PetscInt nonzerorow = 0;
409: PetscFunctionBegin;
410: PetscCall(VecSet(zz, zero));
411: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412: PetscCall(VecGetArrayRead(xx, &x));
413: PetscCall(VecGetArray(zz, &z));
415: v = a->a;
416: xb = x;
418: for (i = 0; i < mbs; i++) {
419: n = ai[1] - ai[0]; /* length of i_th block row of A */
420: x1 = xb[0];
421: x2 = xb[1];
422: x3 = xb[2];
423: x4 = xb[3];
424: ib = aj + *ai;
425: jmin = 0;
426: nonzerorow += (n > 0);
427: if (*ib == i) { /* (diag of A)*x */
428: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432: v += 16;
433: jmin++;
434: }
435: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
436: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437: for (j = jmin; j < n; j++) {
438: /* (strict lower triangular part of A)*x */
439: cval = ib[j] * 4;
440: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444: /* (strict upper triangular part of A)*x */
445: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449: v += 16;
450: }
451: xb += 4;
452: ai++;
453: }
455: PetscCall(VecRestoreArrayRead(xx, &x));
456: PetscCall(VecRestoreArray(zz, &z));
457: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458: PetscFunctionReturn(PETSC_SUCCESS);
459: }
461: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462: {
463: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
464: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
465: const PetscScalar *x, *xb;
466: const MatScalar *v;
467: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
468: const PetscInt *aj = a->j, *ai = a->i, *ib;
469: PetscInt nonzerorow = 0;
471: PetscFunctionBegin;
472: PetscCall(VecSet(zz, zero));
473: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474: PetscCall(VecGetArrayRead(xx, &x));
475: PetscCall(VecGetArray(zz, &z));
477: v = a->a;
478: xb = x;
480: for (i = 0; i < mbs; i++) {
481: n = ai[1] - ai[0]; /* length of i_th block row of A */
482: x1 = xb[0];
483: x2 = xb[1];
484: x3 = xb[2];
485: x4 = xb[3];
486: x5 = xb[4];
487: ib = aj + *ai;
488: jmin = 0;
489: nonzerorow += (n > 0);
490: if (*ib == i) { /* (diag of A)*x */
491: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496: v += 25;
497: jmin++;
498: }
499: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
500: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501: for (j = jmin; j < n; j++) {
502: /* (strict lower triangular part of A)*x */
503: cval = ib[j] * 5;
504: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509: /* (strict upper triangular part of A)*x */
510: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515: v += 25;
516: }
517: xb += 5;
518: ai++;
519: }
521: PetscCall(VecRestoreArrayRead(xx, &x));
522: PetscCall(VecRestoreArray(zz, &z));
523: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524: PetscFunctionReturn(PETSC_SUCCESS);
525: }
527: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528: {
529: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
530: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531: const PetscScalar *x, *xb;
532: const MatScalar *v;
533: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
534: const PetscInt *aj = a->j, *ai = a->i, *ib;
535: PetscInt nonzerorow = 0;
537: PetscFunctionBegin;
538: PetscCall(VecSet(zz, zero));
539: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540: PetscCall(VecGetArrayRead(xx, &x));
541: PetscCall(VecGetArray(zz, &z));
543: v = a->a;
544: xb = x;
546: for (i = 0; i < mbs; i++) {
547: n = ai[1] - ai[0]; /* length of i_th block row of A */
548: x1 = xb[0];
549: x2 = xb[1];
550: x3 = xb[2];
551: x4 = xb[3];
552: x5 = xb[4];
553: x6 = xb[5];
554: ib = aj + *ai;
555: jmin = 0;
556: nonzerorow += (n > 0);
557: if (*ib == i) { /* (diag of A)*x */
558: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564: v += 36;
565: jmin++;
566: }
567: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
568: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569: for (j = jmin; j < n; j++) {
570: /* (strict lower triangular part of A)*x */
571: cval = ib[j] * 6;
572: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578: /* (strict upper triangular part of A)*x */
579: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585: v += 36;
586: }
587: xb += 6;
588: ai++;
589: }
591: PetscCall(VecRestoreArrayRead(xx, &x));
592: PetscCall(VecRestoreArray(zz, &z));
593: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594: PetscFunctionReturn(PETSC_SUCCESS);
595: }
597: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598: {
599: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
600: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601: const PetscScalar *x, *xb;
602: const MatScalar *v;
603: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
604: const PetscInt *aj = a->j, *ai = a->i, *ib;
605: PetscInt nonzerorow = 0;
607: PetscFunctionBegin;
608: PetscCall(VecSet(zz, zero));
609: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610: PetscCall(VecGetArrayRead(xx, &x));
611: PetscCall(VecGetArray(zz, &z));
613: v = a->a;
614: xb = x;
616: for (i = 0; i < mbs; i++) {
617: n = ai[1] - ai[0]; /* length of i_th block row of A */
618: x1 = xb[0];
619: x2 = xb[1];
620: x3 = xb[2];
621: x4 = xb[3];
622: x5 = xb[4];
623: x6 = xb[5];
624: x7 = xb[6];
625: ib = aj + *ai;
626: jmin = 0;
627: nonzerorow += (n > 0);
628: if (*ib == i) { /* (diag of A)*x */
629: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636: v += 49;
637: jmin++;
638: }
639: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
640: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641: for (j = jmin; j < n; j++) {
642: /* (strict lower triangular part of A)*x */
643: cval = ib[j] * 7;
644: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651: /* (strict upper triangular part of A)*x */
652: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659: v += 49;
660: }
661: xb += 7;
662: ai++;
663: }
664: PetscCall(VecRestoreArrayRead(xx, &x));
665: PetscCall(VecRestoreArray(zz, &z));
666: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: /*
671: This will not work with MatScalar == float because it calls the BLAS
672: */
673: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674: {
675: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
676: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677: const PetscScalar *x, *x_ptr, *xb;
678: const MatScalar *v;
679: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680: const PetscInt *idx, *aj, *ii;
681: PetscInt nonzerorow = 0;
683: PetscFunctionBegin;
684: PetscCall(VecSet(zz, zero));
685: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686: PetscCall(VecGetArrayRead(xx, &x));
687: PetscCall(VecGetArray(zz, &z));
689: x_ptr = x;
690: z_ptr = z;
692: aj = a->j;
693: v = a->a;
694: ii = a->i;
696: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697: work = a->mult_work;
699: for (i = 0; i < mbs; i++) {
700: n = ii[1] - ii[0];
701: ncols = n * bs;
702: workt = work;
703: idx = aj + ii[0];
704: nonzerorow += (n > 0);
706: /* upper triangular part */
707: for (j = 0; j < n; j++) {
708: xb = x_ptr + bs * (*idx++);
709: for (k = 0; k < bs; k++) workt[k] = xb[k];
710: workt += bs;
711: }
712: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
715: /* strict lower triangular part */
716: idx = aj + ii[0];
717: if (n && *idx == i) {
718: ncols -= bs;
719: v += bs2;
720: idx++;
721: n--;
722: }
724: if (ncols > 0) {
725: workt = work;
726: PetscCall(PetscArrayzero(workt, ncols));
727: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728: for (j = 0; j < n; j++) {
729: zb = z_ptr + bs * (*idx++);
730: for (k = 0; k < bs; k++) zb[k] += workt[k];
731: workt += bs;
732: }
733: }
734: x += bs;
735: v += n * bs2;
736: z += bs;
737: ii++;
738: }
740: PetscCall(VecRestoreArrayRead(xx, &x));
741: PetscCall(VecRestoreArray(zz, &z));
742: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743: PetscFunctionReturn(PETSC_SUCCESS);
744: }
746: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747: {
748: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
749: PetscScalar *z, x1;
750: const PetscScalar *x, *xb;
751: const MatScalar *v;
752: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
753: const PetscInt *aj = a->j, *ai = a->i, *ib;
754: PetscInt nonzerorow = 0;
755: #if defined(PETSC_USE_COMPLEX)
756: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
757: #else
758: const int aconj = 0;
759: #endif
761: PetscFunctionBegin;
762: PetscCall(VecCopy(yy, zz));
763: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
764: PetscCall(VecGetArrayRead(xx, &x));
765: PetscCall(VecGetArray(zz, &z));
766: v = a->a;
767: xb = x;
769: for (i = 0; i < mbs; i++) {
770: n = ai[1] - ai[0]; /* length of i_th row of A */
771: x1 = xb[0];
772: ib = aj + *ai;
773: jmin = 0;
774: nonzerorow += (n > 0);
775: if (n && *ib == i) { /* (diag of A)*x */
776: z[i] += *v++ * x[*ib++];
777: jmin++;
778: }
779: if (aconj) {
780: for (j = jmin; j < n; j++) {
781: cval = *ib;
782: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
783: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
784: }
785: } else {
786: for (j = jmin; j < n; j++) {
787: cval = *ib;
788: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
789: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
790: }
791: }
792: xb++;
793: ai++;
794: }
796: PetscCall(VecRestoreArrayRead(xx, &x));
797: PetscCall(VecRestoreArray(zz, &z));
799: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
800: PetscFunctionReturn(PETSC_SUCCESS);
801: }
803: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
804: {
805: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
806: PetscScalar *z, x1, x2;
807: const PetscScalar *x, *xb;
808: const MatScalar *v;
809: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
810: const PetscInt *aj = a->j, *ai = a->i, *ib;
811: PetscInt nonzerorow = 0;
813: PetscFunctionBegin;
814: PetscCall(VecCopy(yy, zz));
815: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816: PetscCall(VecGetArrayRead(xx, &x));
817: PetscCall(VecGetArray(zz, &z));
819: v = a->a;
820: xb = x;
822: for (i = 0; i < mbs; i++) {
823: n = ai[1] - ai[0]; /* length of i_th block row of A */
824: x1 = xb[0];
825: x2 = xb[1];
826: ib = aj + *ai;
827: jmin = 0;
828: nonzerorow += (n > 0);
829: if (n && *ib == i) { /* (diag of A)*x */
830: z[2 * i] += v[0] * x1 + v[2] * x2;
831: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
832: v += 4;
833: jmin++;
834: }
835: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
836: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
837: for (j = jmin; j < n; j++) {
838: /* (strict lower triangular part of A)*x */
839: cval = ib[j] * 2;
840: z[cval] += v[0] * x1 + v[1] * x2;
841: z[cval + 1] += v[2] * x1 + v[3] * x2;
842: /* (strict upper triangular part of A)*x */
843: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
844: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
845: v += 4;
846: }
847: xb += 2;
848: ai++;
849: }
850: PetscCall(VecRestoreArrayRead(xx, &x));
851: PetscCall(VecRestoreArray(zz, &z));
853: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
854: PetscFunctionReturn(PETSC_SUCCESS);
855: }
857: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
858: {
859: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
860: PetscScalar *z, x1, x2, x3;
861: const PetscScalar *x, *xb;
862: const MatScalar *v;
863: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
864: const PetscInt *aj = a->j, *ai = a->i, *ib;
865: PetscInt nonzerorow = 0;
867: PetscFunctionBegin;
868: PetscCall(VecCopy(yy, zz));
869: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
870: PetscCall(VecGetArrayRead(xx, &x));
871: PetscCall(VecGetArray(zz, &z));
873: v = a->a;
874: xb = x;
876: for (i = 0; i < mbs; i++) {
877: n = ai[1] - ai[0]; /* length of i_th block row of A */
878: x1 = xb[0];
879: x2 = xb[1];
880: x3 = xb[2];
881: ib = aj + *ai;
882: jmin = 0;
883: nonzerorow += (n > 0);
884: if (n && *ib == i) { /* (diag of A)*x */
885: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
886: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
887: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
888: v += 9;
889: jmin++;
890: }
891: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
892: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
893: for (j = jmin; j < n; j++) {
894: /* (strict lower triangular part of A)*x */
895: cval = ib[j] * 3;
896: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
897: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
898: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
899: /* (strict upper triangular part of A)*x */
900: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
901: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
902: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
903: v += 9;
904: }
905: xb += 3;
906: ai++;
907: }
909: PetscCall(VecRestoreArrayRead(xx, &x));
910: PetscCall(VecRestoreArray(zz, &z));
912: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
913: PetscFunctionReturn(PETSC_SUCCESS);
914: }
916: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
917: {
918: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
919: PetscScalar *z, x1, x2, x3, x4;
920: const PetscScalar *x, *xb;
921: const MatScalar *v;
922: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
923: const PetscInt *aj = a->j, *ai = a->i, *ib;
924: PetscInt nonzerorow = 0;
926: PetscFunctionBegin;
927: PetscCall(VecCopy(yy, zz));
928: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
929: PetscCall(VecGetArrayRead(xx, &x));
930: PetscCall(VecGetArray(zz, &z));
932: v = a->a;
933: xb = x;
935: for (i = 0; i < mbs; i++) {
936: n = ai[1] - ai[0]; /* length of i_th block row of A */
937: x1 = xb[0];
938: x2 = xb[1];
939: x3 = xb[2];
940: x4 = xb[3];
941: ib = aj + *ai;
942: jmin = 0;
943: nonzerorow += (n > 0);
944: if (n && *ib == i) { /* (diag of A)*x */
945: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
946: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
947: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
948: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
949: v += 16;
950: jmin++;
951: }
952: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
953: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
954: for (j = jmin; j < n; j++) {
955: /* (strict lower triangular part of A)*x */
956: cval = ib[j] * 4;
957: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
958: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
959: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
960: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
961: /* (strict upper triangular part of A)*x */
962: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
963: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
964: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
965: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
966: v += 16;
967: }
968: xb += 4;
969: ai++;
970: }
972: PetscCall(VecRestoreArrayRead(xx, &x));
973: PetscCall(VecRestoreArray(zz, &z));
975: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
976: PetscFunctionReturn(PETSC_SUCCESS);
977: }
979: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
980: {
981: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
982: PetscScalar *z, x1, x2, x3, x4, x5;
983: const PetscScalar *x, *xb;
984: const MatScalar *v;
985: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
986: const PetscInt *aj = a->j, *ai = a->i, *ib;
987: PetscInt nonzerorow = 0;
989: PetscFunctionBegin;
990: PetscCall(VecCopy(yy, zz));
991: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
992: PetscCall(VecGetArrayRead(xx, &x));
993: PetscCall(VecGetArray(zz, &z));
995: v = a->a;
996: xb = x;
998: for (i = 0; i < mbs; i++) {
999: n = ai[1] - ai[0]; /* length of i_th block row of A */
1000: x1 = xb[0];
1001: x2 = xb[1];
1002: x3 = xb[2];
1003: x4 = xb[3];
1004: x5 = xb[4];
1005: ib = aj + *ai;
1006: jmin = 0;
1007: nonzerorow += (n > 0);
1008: if (n && *ib == i) { /* (diag of A)*x */
1009: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1010: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1011: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1012: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1013: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1014: v += 25;
1015: jmin++;
1016: }
1017: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019: for (j = jmin; j < n; j++) {
1020: /* (strict lower triangular part of A)*x */
1021: cval = ib[j] * 5;
1022: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1023: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1024: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1025: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1026: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1027: /* (strict upper triangular part of A)*x */
1028: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1029: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1030: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1031: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1032: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1033: v += 25;
1034: }
1035: xb += 5;
1036: ai++;
1037: }
1039: PetscCall(VecRestoreArrayRead(xx, &x));
1040: PetscCall(VecRestoreArray(zz, &z));
1042: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1047: {
1048: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1049: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1050: const PetscScalar *x, *xb;
1051: const MatScalar *v;
1052: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1053: const PetscInt *aj = a->j, *ai = a->i, *ib;
1054: PetscInt nonzerorow = 0;
1056: PetscFunctionBegin;
1057: PetscCall(VecCopy(yy, zz));
1058: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1059: PetscCall(VecGetArrayRead(xx, &x));
1060: PetscCall(VecGetArray(zz, &z));
1062: v = a->a;
1063: xb = x;
1065: for (i = 0; i < mbs; i++) {
1066: n = ai[1] - ai[0]; /* length of i_th block row of A */
1067: x1 = xb[0];
1068: x2 = xb[1];
1069: x3 = xb[2];
1070: x4 = xb[3];
1071: x5 = xb[4];
1072: x6 = xb[5];
1073: ib = aj + *ai;
1074: jmin = 0;
1075: nonzerorow += (n > 0);
1076: if (n && *ib == i) { /* (diag of A)*x */
1077: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1078: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1079: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1080: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1081: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1082: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1083: v += 36;
1084: jmin++;
1085: }
1086: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1087: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088: for (j = jmin; j < n; j++) {
1089: /* (strict lower triangular part of A)*x */
1090: cval = ib[j] * 6;
1091: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1092: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1093: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1094: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1095: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1096: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1097: /* (strict upper triangular part of A)*x */
1098: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1099: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1100: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1101: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1102: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1103: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1104: v += 36;
1105: }
1106: xb += 6;
1107: ai++;
1108: }
1110: PetscCall(VecRestoreArrayRead(xx, &x));
1111: PetscCall(VecRestoreArray(zz, &z));
1113: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1114: PetscFunctionReturn(PETSC_SUCCESS);
1115: }
1117: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1118: {
1119: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1120: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1121: const PetscScalar *x, *xb;
1122: const MatScalar *v;
1123: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1124: const PetscInt *aj = a->j, *ai = a->i, *ib;
1125: PetscInt nonzerorow = 0;
1127: PetscFunctionBegin;
1128: PetscCall(VecCopy(yy, zz));
1129: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1130: PetscCall(VecGetArrayRead(xx, &x));
1131: PetscCall(VecGetArray(zz, &z));
1133: v = a->a;
1134: xb = x;
1136: for (i = 0; i < mbs; i++) {
1137: n = ai[1] - ai[0]; /* length of i_th block row of A */
1138: x1 = xb[0];
1139: x2 = xb[1];
1140: x3 = xb[2];
1141: x4 = xb[3];
1142: x5 = xb[4];
1143: x6 = xb[5];
1144: x7 = xb[6];
1145: ib = aj + *ai;
1146: jmin = 0;
1147: nonzerorow += (n > 0);
1148: if (n && *ib == i) { /* (diag of A)*x */
1149: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1150: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1151: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1152: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1153: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1154: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1155: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1156: v += 49;
1157: jmin++;
1158: }
1159: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1160: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1161: for (j = jmin; j < n; j++) {
1162: /* (strict lower triangular part of A)*x */
1163: cval = ib[j] * 7;
1164: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1165: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1166: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1167: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1168: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1169: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1170: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1171: /* (strict upper triangular part of A)*x */
1172: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1173: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1174: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1175: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1176: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1177: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1178: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1179: v += 49;
1180: }
1181: xb += 7;
1182: ai++;
1183: }
1185: PetscCall(VecRestoreArrayRead(xx, &x));
1186: PetscCall(VecRestoreArray(zz, &z));
1188: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1189: PetscFunctionReturn(PETSC_SUCCESS);
1190: }
1192: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1193: {
1194: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1195: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1196: const PetscScalar *x, *x_ptr, *xb;
1197: const MatScalar *v;
1198: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1199: const PetscInt *idx, *aj, *ii;
1200: PetscInt nonzerorow = 0;
1202: PetscFunctionBegin;
1203: PetscCall(VecCopy(yy, zz));
1204: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1205: PetscCall(VecGetArrayRead(xx, &x));
1206: x_ptr = x;
1207: PetscCall(VecGetArray(zz, &z));
1208: z_ptr = z;
1210: aj = a->j;
1211: v = a->a;
1212: ii = a->i;
1214: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1215: work = a->mult_work;
1217: for (i = 0; i < mbs; i++) {
1218: n = ii[1] - ii[0];
1219: ncols = n * bs;
1220: workt = work;
1221: idx = aj + ii[0];
1222: nonzerorow += (n > 0);
1224: /* upper triangular part */
1225: for (j = 0; j < n; j++) {
1226: xb = x_ptr + bs * (*idx++);
1227: for (k = 0; k < bs; k++) workt[k] = xb[k];
1228: workt += bs;
1229: }
1230: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1231: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1233: /* strict lower triangular part */
1234: idx = aj + ii[0];
1235: if (n && *idx == i) {
1236: ncols -= bs;
1237: v += bs2;
1238: idx++;
1239: n--;
1240: }
1241: if (ncols > 0) {
1242: workt = work;
1243: PetscCall(PetscArrayzero(workt, ncols));
1244: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1245: for (j = 0; j < n; j++) {
1246: zb = z_ptr + bs * (*idx++);
1247: for (k = 0; k < bs; k++) zb[k] += workt[k];
1248: workt += bs;
1249: }
1250: }
1252: x += bs;
1253: v += n * bs2;
1254: z += bs;
1255: ii++;
1256: }
1258: PetscCall(VecRestoreArrayRead(xx, &x));
1259: PetscCall(VecRestoreArray(zz, &z));
1261: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1262: PetscFunctionReturn(PETSC_SUCCESS);
1263: }
1265: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1266: {
1267: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1268: PetscScalar oalpha = alpha;
1269: PetscBLASInt one = 1, totalnz;
1271: PetscFunctionBegin;
1272: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1273: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1274: PetscCall(PetscLogFlops(totalnz));
1275: PetscFunctionReturn(PETSC_SUCCESS);
1276: }
1278: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1279: {
1280: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1281: const MatScalar *v = a->a;
1282: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1283: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1284: const PetscInt *aj = a->j, *col;
1286: PetscFunctionBegin;
1287: if (!a->nz) {
1288: *norm = 0.0;
1289: PetscFunctionReturn(PETSC_SUCCESS);
1290: }
1291: if (type == NORM_FROBENIUS) {
1292: for (k = 0; k < mbs; k++) {
1293: jmin = a->i[k];
1294: jmax = a->i[k + 1];
1295: col = aj + jmin;
1296: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1297: for (i = 0; i < bs2; i++) {
1298: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1299: v++;
1300: }
1301: jmin++;
1302: }
1303: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1304: for (i = 0; i < bs2; i++) {
1305: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1306: v++;
1307: }
1308: }
1309: }
1310: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1311: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1312: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1313: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1314: for (i = 0; i < mbs; i++) jl[i] = mbs;
1315: il[0] = 0;
1317: *norm = 0.0;
1318: for (k = 0; k < mbs; k++) { /* k_th block row */
1319: for (j = 0; j < bs; j++) sum[j] = 0.0;
1320: /*-- col sum --*/
1321: i = jl[k]; /* first |A(i,k)| to be added */
1322: /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1323: at step k */
1324: while (i < mbs) {
1325: nexti = jl[i]; /* next block row to be added */
1326: ik = il[i]; /* block index of A(i,k) in the array a */
1327: for (j = 0; j < bs; j++) {
1328: v = a->a + ik * bs2 + j * bs;
1329: for (k1 = 0; k1 < bs; k1++) {
1330: sum[j] += PetscAbsScalar(*v);
1331: v++;
1332: }
1333: }
1334: /* update il, jl */
1335: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1336: jmax = a->i[i + 1];
1337: if (jmin < jmax) {
1338: il[i] = jmin;
1339: j = a->j[jmin];
1340: jl[i] = jl[j];
1341: jl[j] = i;
1342: }
1343: i = nexti;
1344: }
1345: /*-- row sum --*/
1346: jmin = a->i[k];
1347: jmax = a->i[k + 1];
1348: for (i = jmin; i < jmax; i++) {
1349: for (j = 0; j < bs; j++) {
1350: v = a->a + i * bs2 + j;
1351: for (k1 = 0; k1 < bs; k1++) {
1352: sum[j] += PetscAbsScalar(*v);
1353: v += bs;
1354: }
1355: }
1356: }
1357: /* add k_th block row to il, jl */
1358: col = aj + jmin;
1359: if (jmax - jmin > 0 && *col == k) jmin++;
1360: if (jmin < jmax) {
1361: il[k] = jmin;
1362: j = a->j[jmin];
1363: jl[k] = jl[j];
1364: jl[j] = k;
1365: }
1366: for (j = 0; j < bs; j++) {
1367: if (sum[j] > *norm) *norm = sum[j];
1368: }
1369: }
1370: PetscCall(PetscFree3(sum, il, jl));
1371: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1372: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1373: PetscFunctionReturn(PETSC_SUCCESS);
1374: }
1376: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1377: {
1378: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1380: PetscFunctionBegin;
1381: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1382: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1383: *flg = PETSC_FALSE;
1384: PetscFunctionReturn(PETSC_SUCCESS);
1385: }
1387: /* if the a->i are the same */
1388: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1389: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1391: /* if a->j are the same */
1392: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1393: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1395: /* if a->a are the same */
1396: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1397: PetscFunctionReturn(PETSC_SUCCESS);
1398: }
1400: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1401: {
1402: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1403: PetscInt i, j, k, row, bs, ambs, bs2;
1404: const PetscInt *ai, *aj;
1405: PetscScalar *x, zero = 0.0;
1406: const MatScalar *aa, *aa_j;
1408: PetscFunctionBegin;
1409: bs = A->rmap->bs;
1410: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1412: aa = a->a;
1413: ambs = a->mbs;
1415: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1416: PetscInt *diag = a->diag;
1417: aa = a->a;
1418: ambs = a->mbs;
1419: PetscCall(VecGetArray(v, &x));
1420: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1421: PetscCall(VecRestoreArray(v, &x));
1422: PetscFunctionReturn(PETSC_SUCCESS);
1423: }
1425: ai = a->i;
1426: aj = a->j;
1427: bs2 = a->bs2;
1428: PetscCall(VecSet(v, zero));
1429: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1430: PetscCall(VecGetArray(v, &x));
1431: for (i = 0; i < ambs; i++) {
1432: j = ai[i];
1433: if (aj[j] == i) { /* if this is a diagonal element */
1434: row = i * bs;
1435: aa_j = aa + j * bs2;
1436: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1437: }
1438: }
1439: PetscCall(VecRestoreArray(v, &x));
1440: PetscFunctionReturn(PETSC_SUCCESS);
1441: }
1443: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1444: {
1445: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1446: PetscScalar x;
1447: const PetscScalar *l, *li, *ri;
1448: MatScalar *aa, *v;
1449: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1450: const PetscInt *ai, *aj;
1451: PetscBool flg;
1453: PetscFunctionBegin;
1454: if (ll != rr) {
1455: PetscCall(VecEqual(ll, rr, &flg));
1456: PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1457: }
1458: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1459: ai = a->i;
1460: aj = a->j;
1461: aa = a->a;
1462: m = A->rmap->N;
1463: bs = A->rmap->bs;
1464: mbs = a->mbs;
1465: bs2 = a->bs2;
1467: PetscCall(VecGetArrayRead(ll, &l));
1468: PetscCall(VecGetLocalSize(ll, &lm));
1469: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1470: for (i = 0; i < mbs; i++) { /* for each block row */
1471: M = ai[i + 1] - ai[i];
1472: li = l + i * bs;
1473: v = aa + bs2 * ai[i];
1474: for (j = 0; j < M; j++) { /* for each block */
1475: ri = l + bs * aj[ai[i] + j];
1476: for (k = 0; k < bs; k++) {
1477: x = ri[k];
1478: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1479: }
1480: }
1481: }
1482: PetscCall(VecRestoreArrayRead(ll, &l));
1483: PetscCall(PetscLogFlops(2.0 * a->nz));
1484: PetscFunctionReturn(PETSC_SUCCESS);
1485: }
1487: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1488: {
1489: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1491: PetscFunctionBegin;
1492: info->block_size = a->bs2;
1493: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1494: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1495: info->nz_unneeded = info->nz_allocated - info->nz_used;
1496: info->assemblies = A->num_ass;
1497: info->mallocs = A->info.mallocs;
1498: info->memory = 0; /* REVIEW ME */
1499: if (A->factortype) {
1500: info->fill_ratio_given = A->info.fill_ratio_given;
1501: info->fill_ratio_needed = A->info.fill_ratio_needed;
1502: info->factor_mallocs = A->info.factor_mallocs;
1503: } else {
1504: info->fill_ratio_given = 0;
1505: info->fill_ratio_needed = 0;
1506: info->factor_mallocs = 0;
1507: }
1508: PetscFunctionReturn(PETSC_SUCCESS);
1509: }
1511: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1512: {
1513: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1515: PetscFunctionBegin;
1516: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1517: PetscFunctionReturn(PETSC_SUCCESS);
1518: }
1520: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1521: {
1522: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1523: PetscInt i, j, n, row, col, bs, mbs;
1524: const PetscInt *ai, *aj;
1525: PetscReal atmp;
1526: const MatScalar *aa;
1527: PetscScalar *x;
1528: PetscInt ncols, brow, bcol, krow, kcol;
1530: PetscFunctionBegin;
1531: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1532: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1533: bs = A->rmap->bs;
1534: aa = a->a;
1535: ai = a->i;
1536: aj = a->j;
1537: mbs = a->mbs;
1539: PetscCall(VecSet(v, 0.0));
1540: PetscCall(VecGetArray(v, &x));
1541: PetscCall(VecGetLocalSize(v, &n));
1542: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1543: for (i = 0; i < mbs; i++) {
1544: ncols = ai[1] - ai[0];
1545: ai++;
1546: brow = bs * i;
1547: for (j = 0; j < ncols; j++) {
1548: bcol = bs * (*aj);
1549: for (kcol = 0; kcol < bs; kcol++) {
1550: col = bcol + kcol; /* col index */
1551: for (krow = 0; krow < bs; krow++) {
1552: atmp = PetscAbsScalar(*aa);
1553: aa++;
1554: row = brow + krow; /* row index */
1555: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1556: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1557: }
1558: }
1559: aj++;
1560: }
1561: }
1562: PetscCall(VecRestoreArray(v, &x));
1563: PetscFunctionReturn(PETSC_SUCCESS);
1564: }
1566: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1567: {
1568: PetscFunctionBegin;
1569: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1570: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1571: PetscFunctionReturn(PETSC_SUCCESS);
1572: }
1574: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1575: {
1576: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1577: PetscScalar *z = c;
1578: const PetscScalar *xb;
1579: PetscScalar x1;
1580: const MatScalar *v = a->a, *vv;
1581: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1582: #if defined(PETSC_USE_COMPLEX)
1583: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1584: #else
1585: const int aconj = 0;
1586: #endif
1588: PetscFunctionBegin;
1589: for (i = 0; i < mbs; i++) {
1590: n = ii[1] - ii[0];
1591: ii++;
1592: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1593: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1594: jj = idx;
1595: vv = v;
1596: for (k = 0; k < cn; k++) {
1597: idx = jj;
1598: v = vv;
1599: for (j = 0; j < n; j++) {
1600: xb = b + (*idx);
1601: x1 = xb[0 + k * bm];
1602: z[0 + k * cm] += v[0] * x1;
1603: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1604: v += 1;
1605: ++idx;
1606: }
1607: }
1608: z += 1;
1609: }
1610: PetscFunctionReturn(PETSC_SUCCESS);
1611: }
1613: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1614: {
1615: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1616: PetscScalar *z = c;
1617: const PetscScalar *xb;
1618: PetscScalar x1, x2;
1619: const MatScalar *v = a->a, *vv;
1620: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1622: PetscFunctionBegin;
1623: for (i = 0; i < mbs; i++) {
1624: n = ii[1] - ii[0];
1625: ii++;
1626: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1627: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1628: jj = idx;
1629: vv = v;
1630: for (k = 0; k < cn; k++) {
1631: idx = jj;
1632: v = vv;
1633: for (j = 0; j < n; j++) {
1634: xb = b + 2 * (*idx);
1635: x1 = xb[0 + k * bm];
1636: x2 = xb[1 + k * bm];
1637: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1638: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1639: if (*idx != i) {
1640: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1641: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1642: }
1643: v += 4;
1644: ++idx;
1645: }
1646: }
1647: z += 2;
1648: }
1649: PetscFunctionReturn(PETSC_SUCCESS);
1650: }
1652: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1653: {
1654: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1655: PetscScalar *z = c;
1656: const PetscScalar *xb;
1657: PetscScalar x1, x2, x3;
1658: const MatScalar *v = a->a, *vv;
1659: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1661: PetscFunctionBegin;
1662: for (i = 0; i < mbs; i++) {
1663: n = ii[1] - ii[0];
1664: ii++;
1665: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1666: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1667: jj = idx;
1668: vv = v;
1669: for (k = 0; k < cn; k++) {
1670: idx = jj;
1671: v = vv;
1672: for (j = 0; j < n; j++) {
1673: xb = b + 3 * (*idx);
1674: x1 = xb[0 + k * bm];
1675: x2 = xb[1 + k * bm];
1676: x3 = xb[2 + k * bm];
1677: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1678: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1679: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1680: if (*idx != i) {
1681: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1682: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1683: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1684: }
1685: v += 9;
1686: ++idx;
1687: }
1688: }
1689: z += 3;
1690: }
1691: PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1694: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1695: {
1696: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1697: PetscScalar *z = c;
1698: const PetscScalar *xb;
1699: PetscScalar x1, x2, x3, x4;
1700: const MatScalar *v = a->a, *vv;
1701: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1703: PetscFunctionBegin;
1704: for (i = 0; i < mbs; i++) {
1705: n = ii[1] - ii[0];
1706: ii++;
1707: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1708: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1709: jj = idx;
1710: vv = v;
1711: for (k = 0; k < cn; k++) {
1712: idx = jj;
1713: v = vv;
1714: for (j = 0; j < n; j++) {
1715: xb = b + 4 * (*idx);
1716: x1 = xb[0 + k * bm];
1717: x2 = xb[1 + k * bm];
1718: x3 = xb[2 + k * bm];
1719: x4 = xb[3 + k * bm];
1720: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1721: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1722: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1723: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1724: if (*idx != i) {
1725: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1726: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1727: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1728: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1729: }
1730: v += 16;
1731: ++idx;
1732: }
1733: }
1734: z += 4;
1735: }
1736: PetscFunctionReturn(PETSC_SUCCESS);
1737: }
1739: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1740: {
1741: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1742: PetscScalar *z = c;
1743: const PetscScalar *xb;
1744: PetscScalar x1, x2, x3, x4, x5;
1745: const MatScalar *v = a->a, *vv;
1746: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1748: PetscFunctionBegin;
1749: for (i = 0; i < mbs; i++) {
1750: n = ii[1] - ii[0];
1751: ii++;
1752: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1753: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1754: jj = idx;
1755: vv = v;
1756: for (k = 0; k < cn; k++) {
1757: idx = jj;
1758: v = vv;
1759: for (j = 0; j < n; j++) {
1760: xb = b + 5 * (*idx);
1761: x1 = xb[0 + k * bm];
1762: x2 = xb[1 + k * bm];
1763: x3 = xb[2 + k * bm];
1764: x4 = xb[3 + k * bm];
1765: x5 = xb[4 + k * cm];
1766: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1767: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1768: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1769: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1770: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1771: if (*idx != i) {
1772: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1773: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1774: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1775: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1776: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1777: }
1778: v += 25;
1779: ++idx;
1780: }
1781: }
1782: z += 5;
1783: }
1784: PetscFunctionReturn(PETSC_SUCCESS);
1785: }
1787: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1788: {
1789: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1790: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1791: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1792: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1793: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1794: PetscBLASInt bbs, bcn, bbm, bcm;
1795: PetscScalar *z = NULL;
1796: PetscScalar *c, *b;
1797: const MatScalar *v;
1798: const PetscInt *idx, *ii;
1799: PetscScalar _DOne = 1.0;
1801: PetscFunctionBegin;
1802: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1803: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1804: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1805: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1806: b = bd->v;
1807: PetscCall(MatZeroEntries(C));
1808: PetscCall(MatDenseGetArray(C, &c));
1809: switch (bs) {
1810: case 1:
1811: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1812: break;
1813: case 2:
1814: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1815: break;
1816: case 3:
1817: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1818: break;
1819: case 4:
1820: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1821: break;
1822: case 5:
1823: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1824: break;
1825: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1826: PetscCall(PetscBLASIntCast(bs, &bbs));
1827: PetscCall(PetscBLASIntCast(cn, &bcn));
1828: PetscCall(PetscBLASIntCast(bm, &bbm));
1829: PetscCall(PetscBLASIntCast(cm, &bcm));
1830: idx = a->j;
1831: v = a->a;
1832: mbs = a->mbs;
1833: ii = a->i;
1834: z = c;
1835: for (i = 0; i < mbs; i++) {
1836: n = ii[1] - ii[0];
1837: ii++;
1838: for (j = 0; j < n; j++) {
1839: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1840: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1841: v += bs2;
1842: }
1843: z += bs;
1844: }
1845: }
1846: PetscCall(MatDenseRestoreArray(C, &c));
1847: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1848: PetscFunctionReturn(PETSC_SUCCESS);
1849: }