Actual source code: sbaij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
11: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12: const PetscInt *idx;
13: PetscBT table_out, table_in;
15: PetscFunctionBegin;
16: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscCall(PetscBTCreate(mbs, &table_out));
22: PetscCall(PetscMalloc1(mbs + 1, &nidx));
23: PetscCall(PetscBTCreate(mbs, &table_in));
25: for (i = 0; i < is_max; i++) { /* for each is */
26: isz = 0;
27: PetscCall(PetscBTMemzero(mbs, table_out));
29: /* Extract the indices, assume there can be duplicate entries */
30: PetscCall(ISGetIndices(is[i], &idx));
31: PetscCall(ISGetLocalSize(is[i], &n));
33: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34: bcol_max = 0;
35: for (j = 0; j < n; ++j) {
36: brow = idx[j] / bs; /* convert the indices into block indices */
37: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38: if (!PetscBTLookupSet(table_out, brow)) {
39: nidx[isz++] = brow;
40: if (bcol_max < brow) bcol_max = brow;
41: }
42: }
43: PetscCall(ISRestoreIndices(is[i], &idx));
44: PetscCall(ISDestroy(&is[i]));
46: k = 0;
47: for (j = 0; j < ov; j++) { /* for each overlap */
48: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49: PetscCall(PetscBTMemzero(mbs, table_in));
50: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52: n = isz; /* length of the updated is[i] */
53: for (brow = 0; brow < mbs; brow++) {
54: start = ai[brow];
55: end = ai[brow + 1];
56: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l < end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out, bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l < end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in, bcol)) {
71: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
78: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79: } /* for each is */
80: PetscCall(PetscBTDestroy(&table_out));
81: PetscCall(PetscFree(nidx));
82: PetscCall(PetscBTDestroy(&table_in));
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87: Zero some ops' to avoid invalid use */
88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89: {
90: PetscFunctionBegin;
91: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92: Bseq->ops->mult = NULL;
93: Bseq->ops->multadd = NULL;
94: Bseq->ops->multtranspose = NULL;
95: Bseq->ops->multtransposeadd = NULL;
96: Bseq->ops->lufactor = NULL;
97: Bseq->ops->choleskyfactor = NULL;
98: Bseq->ops->lufactorsymbolic = NULL;
99: Bseq->ops->choleskyfactorsymbolic = NULL;
100: Bseq->ops->getinertia = NULL;
101: PetscFunctionReturn(PETSC_SUCCESS);
102: }
104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108: Mat_SeqBAIJ *d = NULL;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
120: PetscCall(ISGetIndices(isrow, &irow));
121: PetscCall(ISGetIndices(iscol, &icol));
122: PetscCall(ISGetLocalSize(isrow, &nrows));
123: PetscCall(ISGetLocalSize(iscol, &ncols));
125: PetscCall(PetscCalloc1(1 + oldcols, &smap));
126: ssmap = smap;
127: PetscCall(PetscMalloc1(1 + nrows, &lens));
128: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129: /* determine lens of each row */
130: for (i = 0; i < nrows; i++) {
131: kstart = ai[irow[i]];
132: kend = kstart + a->ilen[irow[i]];
133: lens[i] = 0;
134: for (k = kstart; k < kend; k++) {
135: if (ssmap[aj[k]]) lens[i]++;
136: }
137: }
138: /* Create and fill new matrix */
139: if (scall == MAT_REUSE_MATRIX) {
140: if (sym) {
141: c = (Mat_SeqSBAIJ *)((*B)->data);
143: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
144: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
145: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
146: PetscCall(PetscArrayzero(c->ilen, c->mbs));
147: } else {
148: d = (Mat_SeqBAIJ *)((*B)->data);
150: PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
151: PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
152: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
153: PetscCall(PetscArrayzero(d->ilen, d->mbs));
154: }
155: C = *B;
156: } else {
157: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
158: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
159: if (sym) {
160: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
161: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
162: } else {
163: PetscCall(MatSetType(C, MATSEQBAIJ));
164: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
165: }
166: }
167: if (sym) c = (Mat_SeqSBAIJ *)(C->data);
168: else d = (Mat_SeqBAIJ *)(C->data);
169: for (i = 0; i < nrows; i++) {
170: row = irow[i];
171: kstart = ai[row];
172: kend = kstart + a->ilen[row];
173: if (sym) {
174: mat_i = c->i[i];
175: mat_j = c->j + mat_i;
176: mat_a = c->a + mat_i * bs2;
177: mat_ilen = c->ilen + i;
178: } else {
179: mat_i = d->i[i];
180: mat_j = d->j + mat_i;
181: mat_a = d->a + mat_i * bs2;
182: mat_ilen = d->ilen + i;
183: }
184: for (k = kstart; k < kend; k++) {
185: if ((tcol = ssmap[a->j[k]])) {
186: *mat_j++ = tcol - 1;
187: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
188: mat_a += bs2;
189: (*mat_ilen)++;
190: }
191: }
192: }
193: /* sort */
194: {
195: MatScalar *work;
197: PetscCall(PetscMalloc1(bs2, &work));
198: for (i = 0; i < nrows; i++) {
199: PetscInt ilen;
200: if (sym) {
201: mat_i = c->i[i];
202: mat_j = c->j + mat_i;
203: mat_a = c->a + mat_i * bs2;
204: ilen = c->ilen[i];
205: } else {
206: mat_i = d->i[i];
207: mat_j = d->j + mat_i;
208: mat_a = d->a + mat_i * bs2;
209: ilen = d->ilen[i];
210: }
211: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
212: }
213: PetscCall(PetscFree(work));
214: }
216: /* Free work space */
217: PetscCall(ISRestoreIndices(iscol, &icol));
218: PetscCall(PetscFree(smap));
219: PetscCall(PetscFree(lens));
220: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
221: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
223: PetscCall(ISRestoreIndices(isrow, &irow));
224: *B = C;
225: PetscFunctionReturn(PETSC_SUCCESS);
226: }
228: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
229: {
230: Mat C[2];
231: IS is1, is2, intersect = NULL;
232: PetscInt n1, n2, ni;
233: PetscBool sym = PETSC_TRUE;
235: PetscFunctionBegin;
236: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
237: if (isrow == iscol) {
238: is2 = is1;
239: PetscCall(PetscObjectReference((PetscObject)is2));
240: } else {
241: PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
242: PetscCall(ISIntersect(is1, is2, &intersect));
243: PetscCall(ISGetLocalSize(intersect, &ni));
244: PetscCall(ISDestroy(&intersect));
245: if (ni == 0) sym = PETSC_FALSE;
246: else if (PetscDefined(USE_DEBUG)) {
247: PetscCall(ISGetLocalSize(is1, &n1));
248: PetscCall(ISGetLocalSize(is2, &n2));
249: PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
250: }
251: }
252: if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
253: else {
254: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
255: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
256: PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
257: PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
258: PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
259: if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
260: else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
261: else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
262: PetscCall(MatDestroy(C));
263: PetscCall(MatDestroy(C + 1));
264: }
265: PetscCall(ISDestroy(&is1));
266: PetscCall(ISDestroy(&is2));
268: if (sym && isrow != iscol) {
269: PetscBool isequal;
270: PetscCall(ISEqual(isrow, iscol, &isequal));
271: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
272: }
273: PetscFunctionReturn(PETSC_SUCCESS);
274: }
276: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
277: {
278: PetscInt i;
280: PetscFunctionBegin;
281: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
283: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
284: PetscFunctionReturn(PETSC_SUCCESS);
285: }
287: /* Should check that shapes of vectors and matrices match */
288: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
289: {
290: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
291: PetscScalar *z, x1, x2, zero = 0.0;
292: const PetscScalar *x, *xb;
293: const MatScalar *v;
294: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
295: const PetscInt *aj = a->j, *ai = a->i, *ib;
296: PetscInt nonzerorow = 0;
298: PetscFunctionBegin;
299: PetscCall(VecSet(zz, zero));
300: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
301: PetscCall(VecGetArrayRead(xx, &x));
302: PetscCall(VecGetArray(zz, &z));
304: v = a->a;
305: xb = x;
307: for (i = 0; i < mbs; i++) {
308: n = ai[1] - ai[0]; /* length of i_th block row of A */
309: x1 = xb[0];
310: x2 = xb[1];
311: ib = aj + *ai;
312: jmin = 0;
313: nonzerorow += (n > 0);
314: if (*ib == i) { /* (diag of A)*x */
315: z[2 * i] += v[0] * x1 + v[2] * x2;
316: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
317: v += 4;
318: jmin++;
319: }
320: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
321: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
322: for (j = jmin; j < n; j++) {
323: /* (strict lower triangular part of A)*x */
324: cval = ib[j] * 2;
325: z[cval] += v[0] * x1 + v[1] * x2;
326: z[cval + 1] += v[2] * x1 + v[3] * x2;
327: /* (strict upper triangular part of A)*x */
328: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
329: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
330: v += 4;
331: }
332: xb += 2;
333: ai++;
334: }
336: PetscCall(VecRestoreArrayRead(xx, &x));
337: PetscCall(VecRestoreArray(zz, &z));
338: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
339: PetscFunctionReturn(PETSC_SUCCESS);
340: }
342: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
343: {
344: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
345: PetscScalar *z, x1, x2, x3, zero = 0.0;
346: const PetscScalar *x, *xb;
347: const MatScalar *v;
348: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
349: const PetscInt *aj = a->j, *ai = a->i, *ib;
350: PetscInt nonzerorow = 0;
352: PetscFunctionBegin;
353: PetscCall(VecSet(zz, zero));
354: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
355: PetscCall(VecGetArrayRead(xx, &x));
356: PetscCall(VecGetArray(zz, &z));
358: v = a->a;
359: xb = x;
361: for (i = 0; i < mbs; i++) {
362: n = ai[1] - ai[0]; /* length of i_th block row of A */
363: x1 = xb[0];
364: x2 = xb[1];
365: x3 = xb[2];
366: ib = aj + *ai;
367: jmin = 0;
368: nonzerorow += (n > 0);
369: if (*ib == i) { /* (diag of A)*x */
370: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
371: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
372: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
373: v += 9;
374: jmin++;
375: }
376: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
377: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378: for (j = jmin; j < n; j++) {
379: /* (strict lower triangular part of A)*x */
380: cval = ib[j] * 3;
381: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
382: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
383: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
384: /* (strict upper triangular part of A)*x */
385: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
386: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
387: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
388: v += 9;
389: }
390: xb += 3;
391: ai++;
392: }
394: PetscCall(VecRestoreArrayRead(xx, &x));
395: PetscCall(VecRestoreArray(zz, &z));
396: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
397: PetscFunctionReturn(PETSC_SUCCESS);
398: }
400: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
401: {
402: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
403: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
404: const PetscScalar *x, *xb;
405: const MatScalar *v;
406: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
407: const PetscInt *aj = a->j, *ai = a->i, *ib;
408: PetscInt nonzerorow = 0;
410: PetscFunctionBegin;
411: PetscCall(VecSet(zz, zero));
412: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
413: PetscCall(VecGetArrayRead(xx, &x));
414: PetscCall(VecGetArray(zz, &z));
416: v = a->a;
417: xb = x;
419: for (i = 0; i < mbs; i++) {
420: n = ai[1] - ai[0]; /* length of i_th block row of A */
421: x1 = xb[0];
422: x2 = xb[1];
423: x3 = xb[2];
424: x4 = xb[3];
425: ib = aj + *ai;
426: jmin = 0;
427: nonzerorow += (n > 0);
428: if (*ib == i) { /* (diag of A)*x */
429: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
430: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
431: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
432: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
433: v += 16;
434: jmin++;
435: }
436: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
437: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
438: for (j = jmin; j < n; j++) {
439: /* (strict lower triangular part of A)*x */
440: cval = ib[j] * 4;
441: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
442: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
443: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
444: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
445: /* (strict upper triangular part of A)*x */
446: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
447: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
448: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
449: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
450: v += 16;
451: }
452: xb += 4;
453: ai++;
454: }
456: PetscCall(VecRestoreArrayRead(xx, &x));
457: PetscCall(VecRestoreArray(zz, &z));
458: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
459: PetscFunctionReturn(PETSC_SUCCESS);
460: }
462: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
463: {
464: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
465: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
466: const PetscScalar *x, *xb;
467: const MatScalar *v;
468: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
469: const PetscInt *aj = a->j, *ai = a->i, *ib;
470: PetscInt nonzerorow = 0;
472: PetscFunctionBegin;
473: PetscCall(VecSet(zz, zero));
474: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
475: PetscCall(VecGetArrayRead(xx, &x));
476: PetscCall(VecGetArray(zz, &z));
478: v = a->a;
479: xb = x;
481: for (i = 0; i < mbs; i++) {
482: n = ai[1] - ai[0]; /* length of i_th block row of A */
483: x1 = xb[0];
484: x2 = xb[1];
485: x3 = xb[2];
486: x4 = xb[3];
487: x5 = xb[4];
488: ib = aj + *ai;
489: jmin = 0;
490: nonzerorow += (n > 0);
491: if (*ib == i) { /* (diag of A)*x */
492: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
493: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
494: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
495: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
496: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
497: v += 25;
498: jmin++;
499: }
500: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
501: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502: for (j = jmin; j < n; j++) {
503: /* (strict lower triangular part of A)*x */
504: cval = ib[j] * 5;
505: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
506: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
507: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
508: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
509: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
510: /* (strict upper triangular part of A)*x */
511: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
512: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
513: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
514: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
515: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
516: v += 25;
517: }
518: xb += 5;
519: ai++;
520: }
522: PetscCall(VecRestoreArrayRead(xx, &x));
523: PetscCall(VecRestoreArray(zz, &z));
524: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
525: PetscFunctionReturn(PETSC_SUCCESS);
526: }
528: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
529: {
530: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
531: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
532: const PetscScalar *x, *xb;
533: const MatScalar *v;
534: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
535: const PetscInt *aj = a->j, *ai = a->i, *ib;
536: PetscInt nonzerorow = 0;
538: PetscFunctionBegin;
539: PetscCall(VecSet(zz, zero));
540: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
541: PetscCall(VecGetArrayRead(xx, &x));
542: PetscCall(VecGetArray(zz, &z));
544: v = a->a;
545: xb = x;
547: for (i = 0; i < mbs; i++) {
548: n = ai[1] - ai[0]; /* length of i_th block row of A */
549: x1 = xb[0];
550: x2 = xb[1];
551: x3 = xb[2];
552: x4 = xb[3];
553: x5 = xb[4];
554: x6 = xb[5];
555: ib = aj + *ai;
556: jmin = 0;
557: nonzerorow += (n > 0);
558: if (*ib == i) { /* (diag of A)*x */
559: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
560: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
561: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
562: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
563: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
564: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
565: v += 36;
566: jmin++;
567: }
568: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
569: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
570: for (j = jmin; j < n; j++) {
571: /* (strict lower triangular part of A)*x */
572: cval = ib[j] * 6;
573: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
574: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
575: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
576: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
577: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
578: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
579: /* (strict upper triangular part of A)*x */
580: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
581: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
582: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
583: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
584: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
585: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
586: v += 36;
587: }
588: xb += 6;
589: ai++;
590: }
592: PetscCall(VecRestoreArrayRead(xx, &x));
593: PetscCall(VecRestoreArray(zz, &z));
594: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
595: PetscFunctionReturn(PETSC_SUCCESS);
596: }
598: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
599: {
600: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
601: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
602: const PetscScalar *x, *xb;
603: const MatScalar *v;
604: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
605: const PetscInt *aj = a->j, *ai = a->i, *ib;
606: PetscInt nonzerorow = 0;
608: PetscFunctionBegin;
609: PetscCall(VecSet(zz, zero));
610: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
611: PetscCall(VecGetArrayRead(xx, &x));
612: PetscCall(VecGetArray(zz, &z));
614: v = a->a;
615: xb = x;
617: for (i = 0; i < mbs; i++) {
618: n = ai[1] - ai[0]; /* length of i_th block row of A */
619: x1 = xb[0];
620: x2 = xb[1];
621: x3 = xb[2];
622: x4 = xb[3];
623: x5 = xb[4];
624: x6 = xb[5];
625: x7 = xb[6];
626: ib = aj + *ai;
627: jmin = 0;
628: nonzerorow += (n > 0);
629: if (*ib == i) { /* (diag of A)*x */
630: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
631: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
632: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
633: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
634: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
635: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
636: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
637: v += 49;
638: jmin++;
639: }
640: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
641: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
642: for (j = jmin; j < n; j++) {
643: /* (strict lower triangular part of A)*x */
644: cval = ib[j] * 7;
645: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
646: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
647: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
648: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
649: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
650: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
651: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
652: /* (strict upper triangular part of A)*x */
653: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
654: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
655: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
656: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
657: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
658: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
659: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
660: v += 49;
661: }
662: xb += 7;
663: ai++;
664: }
665: PetscCall(VecRestoreArrayRead(xx, &x));
666: PetscCall(VecRestoreArray(zz, &z));
667: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
668: PetscFunctionReturn(PETSC_SUCCESS);
669: }
671: /*
672: This will not work with MatScalar == float because it calls the BLAS
673: */
674: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
675: {
676: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
677: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
678: const PetscScalar *x, *x_ptr, *xb;
679: const MatScalar *v;
680: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
681: const PetscInt *idx, *aj, *ii;
682: PetscInt nonzerorow = 0;
684: PetscFunctionBegin;
685: PetscCall(VecSet(zz, zero));
686: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
687: PetscCall(VecGetArrayRead(xx, &x));
688: PetscCall(VecGetArray(zz, &z));
690: x_ptr = x;
691: z_ptr = z;
693: aj = a->j;
694: v = a->a;
695: ii = a->i;
697: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
698: work = a->mult_work;
700: for (i = 0; i < mbs; i++) {
701: n = ii[1] - ii[0];
702: ncols = n * bs;
703: workt = work;
704: idx = aj + ii[0];
705: nonzerorow += (n > 0);
707: /* upper triangular part */
708: for (j = 0; j < n; j++) {
709: xb = x_ptr + bs * (*idx++);
710: for (k = 0; k < bs; k++) workt[k] = xb[k];
711: workt += bs;
712: }
713: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
714: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
716: /* strict lower triangular part */
717: idx = aj + ii[0];
718: if (n && *idx == i) {
719: ncols -= bs;
720: v += bs2;
721: idx++;
722: n--;
723: }
725: if (ncols > 0) {
726: workt = work;
727: PetscCall(PetscArrayzero(workt, ncols));
728: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
729: for (j = 0; j < n; j++) {
730: zb = z_ptr + bs * (*idx++);
731: for (k = 0; k < bs; k++) zb[k] += workt[k];
732: workt += bs;
733: }
734: }
735: x += bs;
736: v += n * bs2;
737: z += bs;
738: ii++;
739: }
741: PetscCall(VecRestoreArrayRead(xx, &x));
742: PetscCall(VecRestoreArray(zz, &z));
743: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
744: PetscFunctionReturn(PETSC_SUCCESS);
745: }
747: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
748: {
749: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
750: PetscScalar *z, x1;
751: const PetscScalar *x, *xb;
752: const MatScalar *v;
753: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
754: const PetscInt *aj = a->j, *ai = a->i, *ib;
755: PetscInt nonzerorow = 0;
756: #if defined(PETSC_USE_COMPLEX)
757: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
758: #else
759: const int aconj = 0;
760: #endif
762: PetscFunctionBegin;
763: PetscCall(VecCopy(yy, zz));
764: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
765: PetscCall(VecGetArrayRead(xx, &x));
766: PetscCall(VecGetArray(zz, &z));
767: v = a->a;
768: xb = x;
770: for (i = 0; i < mbs; i++) {
771: n = ai[1] - ai[0]; /* length of i_th row of A */
772: x1 = xb[0];
773: ib = aj + *ai;
774: jmin = 0;
775: nonzerorow += (n > 0);
776: if (n && *ib == i) { /* (diag of A)*x */
777: z[i] += *v++ * x[*ib++];
778: jmin++;
779: }
780: if (aconj) {
781: for (j = jmin; j < n; j++) {
782: cval = *ib;
783: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
784: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
785: }
786: } else {
787: for (j = jmin; j < n; j++) {
788: cval = *ib;
789: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
790: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
791: }
792: }
793: xb++;
794: ai++;
795: }
797: PetscCall(VecRestoreArrayRead(xx, &x));
798: PetscCall(VecRestoreArray(zz, &z));
800: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
801: PetscFunctionReturn(PETSC_SUCCESS);
802: }
804: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
805: {
806: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
807: PetscScalar *z, x1, x2;
808: const PetscScalar *x, *xb;
809: const MatScalar *v;
810: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
811: const PetscInt *aj = a->j, *ai = a->i, *ib;
812: PetscInt nonzerorow = 0;
814: PetscFunctionBegin;
815: PetscCall(VecCopy(yy, zz));
816: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
817: PetscCall(VecGetArrayRead(xx, &x));
818: PetscCall(VecGetArray(zz, &z));
820: v = a->a;
821: xb = x;
823: for (i = 0; i < mbs; i++) {
824: n = ai[1] - ai[0]; /* length of i_th block row of A */
825: x1 = xb[0];
826: x2 = xb[1];
827: ib = aj + *ai;
828: jmin = 0;
829: nonzerorow += (n > 0);
830: if (n && *ib == i) { /* (diag of A)*x */
831: z[2 * i] += v[0] * x1 + v[2] * x2;
832: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
833: v += 4;
834: jmin++;
835: }
836: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
837: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
838: for (j = jmin; j < n; j++) {
839: /* (strict lower triangular part of A)*x */
840: cval = ib[j] * 2;
841: z[cval] += v[0] * x1 + v[1] * x2;
842: z[cval + 1] += v[2] * x1 + v[3] * x2;
843: /* (strict upper triangular part of A)*x */
844: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
845: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
846: v += 4;
847: }
848: xb += 2;
849: ai++;
850: }
851: PetscCall(VecRestoreArrayRead(xx, &x));
852: PetscCall(VecRestoreArray(zz, &z));
854: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
855: PetscFunctionReturn(PETSC_SUCCESS);
856: }
858: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
859: {
860: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
861: PetscScalar *z, x1, x2, x3;
862: const PetscScalar *x, *xb;
863: const MatScalar *v;
864: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
865: const PetscInt *aj = a->j, *ai = a->i, *ib;
866: PetscInt nonzerorow = 0;
868: PetscFunctionBegin;
869: PetscCall(VecCopy(yy, zz));
870: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
871: PetscCall(VecGetArrayRead(xx, &x));
872: PetscCall(VecGetArray(zz, &z));
874: v = a->a;
875: xb = x;
877: for (i = 0; i < mbs; i++) {
878: n = ai[1] - ai[0]; /* length of i_th block row of A */
879: x1 = xb[0];
880: x2 = xb[1];
881: x3 = xb[2];
882: ib = aj + *ai;
883: jmin = 0;
884: nonzerorow += (n > 0);
885: if (n && *ib == i) { /* (diag of A)*x */
886: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
887: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
888: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
889: v += 9;
890: jmin++;
891: }
892: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
893: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
894: for (j = jmin; j < n; j++) {
895: /* (strict lower triangular part of A)*x */
896: cval = ib[j] * 3;
897: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
898: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
899: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
900: /* (strict upper triangular part of A)*x */
901: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
902: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
903: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
904: v += 9;
905: }
906: xb += 3;
907: ai++;
908: }
910: PetscCall(VecRestoreArrayRead(xx, &x));
911: PetscCall(VecRestoreArray(zz, &z));
913: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
914: PetscFunctionReturn(PETSC_SUCCESS);
915: }
917: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
918: {
919: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
920: PetscScalar *z, x1, x2, x3, x4;
921: const PetscScalar *x, *xb;
922: const MatScalar *v;
923: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
924: const PetscInt *aj = a->j, *ai = a->i, *ib;
925: PetscInt nonzerorow = 0;
927: PetscFunctionBegin;
928: PetscCall(VecCopy(yy, zz));
929: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
930: PetscCall(VecGetArrayRead(xx, &x));
931: PetscCall(VecGetArray(zz, &z));
933: v = a->a;
934: xb = x;
936: for (i = 0; i < mbs; i++) {
937: n = ai[1] - ai[0]; /* length of i_th block row of A */
938: x1 = xb[0];
939: x2 = xb[1];
940: x3 = xb[2];
941: x4 = xb[3];
942: ib = aj + *ai;
943: jmin = 0;
944: nonzerorow += (n > 0);
945: if (n && *ib == i) { /* (diag of A)*x */
946: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
947: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
948: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
949: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
950: v += 16;
951: jmin++;
952: }
953: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
954: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
955: for (j = jmin; j < n; j++) {
956: /* (strict lower triangular part of A)*x */
957: cval = ib[j] * 4;
958: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
959: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
960: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
961: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
962: /* (strict upper triangular part of A)*x */
963: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
964: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
965: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
966: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
967: v += 16;
968: }
969: xb += 4;
970: ai++;
971: }
973: PetscCall(VecRestoreArrayRead(xx, &x));
974: PetscCall(VecRestoreArray(zz, &z));
976: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
977: PetscFunctionReturn(PETSC_SUCCESS);
978: }
980: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
981: {
982: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
983: PetscScalar *z, x1, x2, x3, x4, x5;
984: const PetscScalar *x, *xb;
985: const MatScalar *v;
986: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
987: const PetscInt *aj = a->j, *ai = a->i, *ib;
988: PetscInt nonzerorow = 0;
990: PetscFunctionBegin;
991: PetscCall(VecCopy(yy, zz));
992: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
993: PetscCall(VecGetArrayRead(xx, &x));
994: PetscCall(VecGetArray(zz, &z));
996: v = a->a;
997: xb = x;
999: for (i = 0; i < mbs; i++) {
1000: n = ai[1] - ai[0]; /* length of i_th block row of A */
1001: x1 = xb[0];
1002: x2 = xb[1];
1003: x3 = xb[2];
1004: x4 = xb[3];
1005: x5 = xb[4];
1006: ib = aj + *ai;
1007: jmin = 0;
1008: nonzerorow += (n > 0);
1009: if (n && *ib == i) { /* (diag of A)*x */
1010: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1011: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1012: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1013: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1014: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1015: v += 25;
1016: jmin++;
1017: }
1018: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1019: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1020: for (j = jmin; j < n; j++) {
1021: /* (strict lower triangular part of A)*x */
1022: cval = ib[j] * 5;
1023: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1024: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1025: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1026: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1027: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1028: /* (strict upper triangular part of A)*x */
1029: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1030: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1031: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1032: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1033: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1034: v += 25;
1035: }
1036: xb += 5;
1037: ai++;
1038: }
1040: PetscCall(VecRestoreArrayRead(xx, &x));
1041: PetscCall(VecRestoreArray(zz, &z));
1043: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1044: PetscFunctionReturn(PETSC_SUCCESS);
1045: }
1047: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1048: {
1049: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1050: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1051: const PetscScalar *x, *xb;
1052: const MatScalar *v;
1053: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1054: const PetscInt *aj = a->j, *ai = a->i, *ib;
1055: PetscInt nonzerorow = 0;
1057: PetscFunctionBegin;
1058: PetscCall(VecCopy(yy, zz));
1059: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1060: PetscCall(VecGetArrayRead(xx, &x));
1061: PetscCall(VecGetArray(zz, &z));
1063: v = a->a;
1064: xb = x;
1066: for (i = 0; i < mbs; i++) {
1067: n = ai[1] - ai[0]; /* length of i_th block row of A */
1068: x1 = xb[0];
1069: x2 = xb[1];
1070: x3 = xb[2];
1071: x4 = xb[3];
1072: x5 = xb[4];
1073: x6 = xb[5];
1074: ib = aj + *ai;
1075: jmin = 0;
1076: nonzerorow += (n > 0);
1077: if (n && *ib == i) { /* (diag of A)*x */
1078: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1079: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1080: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1081: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1082: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1083: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1084: v += 36;
1085: jmin++;
1086: }
1087: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1088: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1089: for (j = jmin; j < n; j++) {
1090: /* (strict lower triangular part of A)*x */
1091: cval = ib[j] * 6;
1092: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1093: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1094: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1095: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1096: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1097: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1098: /* (strict upper triangular part of A)*x */
1099: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1100: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1101: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1102: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1103: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1104: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1105: v += 36;
1106: }
1107: xb += 6;
1108: ai++;
1109: }
1111: PetscCall(VecRestoreArrayRead(xx, &x));
1112: PetscCall(VecRestoreArray(zz, &z));
1114: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1115: PetscFunctionReturn(PETSC_SUCCESS);
1116: }
1118: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1119: {
1120: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1121: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1122: const PetscScalar *x, *xb;
1123: const MatScalar *v;
1124: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1125: const PetscInt *aj = a->j, *ai = a->i, *ib;
1126: PetscInt nonzerorow = 0;
1128: PetscFunctionBegin;
1129: PetscCall(VecCopy(yy, zz));
1130: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1131: PetscCall(VecGetArrayRead(xx, &x));
1132: PetscCall(VecGetArray(zz, &z));
1134: v = a->a;
1135: xb = x;
1137: for (i = 0; i < mbs; i++) {
1138: n = ai[1] - ai[0]; /* length of i_th block row of A */
1139: x1 = xb[0];
1140: x2 = xb[1];
1141: x3 = xb[2];
1142: x4 = xb[3];
1143: x5 = xb[4];
1144: x6 = xb[5];
1145: x7 = xb[6];
1146: ib = aj + *ai;
1147: jmin = 0;
1148: nonzerorow += (n > 0);
1149: if (n && *ib == i) { /* (diag of A)*x */
1150: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1151: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1152: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1153: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1154: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1155: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1156: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1157: v += 49;
1158: jmin++;
1159: }
1160: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1161: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1162: for (j = jmin; j < n; j++) {
1163: /* (strict lower triangular part of A)*x */
1164: cval = ib[j] * 7;
1165: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1166: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1167: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1168: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1169: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1170: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1171: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1172: /* (strict upper triangular part of A)*x */
1173: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1174: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1175: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1176: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1177: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1178: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1179: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1180: v += 49;
1181: }
1182: xb += 7;
1183: ai++;
1184: }
1186: PetscCall(VecRestoreArrayRead(xx, &x));
1187: PetscCall(VecRestoreArray(zz, &z));
1189: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1190: PetscFunctionReturn(PETSC_SUCCESS);
1191: }
1193: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1194: {
1195: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1196: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1197: const PetscScalar *x, *x_ptr, *xb;
1198: const MatScalar *v;
1199: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1200: const PetscInt *idx, *aj, *ii;
1201: PetscInt nonzerorow = 0;
1203: PetscFunctionBegin;
1204: PetscCall(VecCopy(yy, zz));
1205: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1206: PetscCall(VecGetArrayRead(xx, &x));
1207: x_ptr = x;
1208: PetscCall(VecGetArray(zz, &z));
1209: z_ptr = z;
1211: aj = a->j;
1212: v = a->a;
1213: ii = a->i;
1215: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1216: work = a->mult_work;
1218: for (i = 0; i < mbs; i++) {
1219: n = ii[1] - ii[0];
1220: ncols = n * bs;
1221: workt = work;
1222: idx = aj + ii[0];
1223: nonzerorow += (n > 0);
1225: /* upper triangular part */
1226: for (j = 0; j < n; j++) {
1227: xb = x_ptr + bs * (*idx++);
1228: for (k = 0; k < bs; k++) workt[k] = xb[k];
1229: workt += bs;
1230: }
1231: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1232: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1234: /* strict lower triangular part */
1235: idx = aj + ii[0];
1236: if (n && *idx == i) {
1237: ncols -= bs;
1238: v += bs2;
1239: idx++;
1240: n--;
1241: }
1242: if (ncols > 0) {
1243: workt = work;
1244: PetscCall(PetscArrayzero(workt, ncols));
1245: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1246: for (j = 0; j < n; j++) {
1247: zb = z_ptr + bs * (*idx++);
1248: for (k = 0; k < bs; k++) zb[k] += workt[k];
1249: workt += bs;
1250: }
1251: }
1253: x += bs;
1254: v += n * bs2;
1255: z += bs;
1256: ii++;
1257: }
1259: PetscCall(VecRestoreArrayRead(xx, &x));
1260: PetscCall(VecRestoreArray(zz, &z));
1262: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1263: PetscFunctionReturn(PETSC_SUCCESS);
1264: }
1266: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1267: {
1268: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1269: PetscScalar oalpha = alpha;
1270: PetscBLASInt one = 1, totalnz;
1272: PetscFunctionBegin;
1273: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1274: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1275: PetscCall(PetscLogFlops(totalnz));
1276: PetscFunctionReturn(PETSC_SUCCESS);
1277: }
1279: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1280: {
1281: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1282: const MatScalar *v = a->a;
1283: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1284: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1285: const PetscInt *aj = a->j, *col;
1287: PetscFunctionBegin;
1288: if (!a->nz) {
1289: *norm = 0.0;
1290: PetscFunctionReturn(PETSC_SUCCESS);
1291: }
1292: if (type == NORM_FROBENIUS) {
1293: for (k = 0; k < mbs; k++) {
1294: jmin = a->i[k];
1295: jmax = a->i[k + 1];
1296: col = aj + jmin;
1297: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1298: for (i = 0; i < bs2; i++) {
1299: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1300: v++;
1301: }
1302: jmin++;
1303: }
1304: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1305: for (i = 0; i < bs2; i++) {
1306: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1307: v++;
1308: }
1309: }
1310: }
1311: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1312: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1313: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1314: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1315: for (i = 0; i < mbs; i++) jl[i] = mbs;
1316: il[0] = 0;
1318: *norm = 0.0;
1319: for (k = 0; k < mbs; k++) { /* k_th block row */
1320: for (j = 0; j < bs; j++) sum[j] = 0.0;
1321: /*-- col sum --*/
1322: i = jl[k]; /* first |A(i,k)| to be added */
1323: /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1324: at step k */
1325: while (i < mbs) {
1326: nexti = jl[i]; /* next block row to be added */
1327: ik = il[i]; /* block index of A(i,k) in the array a */
1328: for (j = 0; j < bs; j++) {
1329: v = a->a + ik * bs2 + j * bs;
1330: for (k1 = 0; k1 < bs; k1++) {
1331: sum[j] += PetscAbsScalar(*v);
1332: v++;
1333: }
1334: }
1335: /* update il, jl */
1336: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1337: jmax = a->i[i + 1];
1338: if (jmin < jmax) {
1339: il[i] = jmin;
1340: j = a->j[jmin];
1341: jl[i] = jl[j];
1342: jl[j] = i;
1343: }
1344: i = nexti;
1345: }
1346: /*-- row sum --*/
1347: jmin = a->i[k];
1348: jmax = a->i[k + 1];
1349: for (i = jmin; i < jmax; i++) {
1350: for (j = 0; j < bs; j++) {
1351: v = a->a + i * bs2 + j;
1352: for (k1 = 0; k1 < bs; k1++) {
1353: sum[j] += PetscAbsScalar(*v);
1354: v += bs;
1355: }
1356: }
1357: }
1358: /* add k_th block row to il, jl */
1359: col = aj + jmin;
1360: if (jmax - jmin > 0 && *col == k) jmin++;
1361: if (jmin < jmax) {
1362: il[k] = jmin;
1363: j = a->j[jmin];
1364: jl[k] = jl[j];
1365: jl[j] = k;
1366: }
1367: for (j = 0; j < bs; j++) {
1368: if (sum[j] > *norm) *norm = sum[j];
1369: }
1370: }
1371: PetscCall(PetscFree3(sum, il, jl));
1372: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1373: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1374: PetscFunctionReturn(PETSC_SUCCESS);
1375: }
1377: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1378: {
1379: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1381: PetscFunctionBegin;
1382: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1383: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1384: *flg = PETSC_FALSE;
1385: PetscFunctionReturn(PETSC_SUCCESS);
1386: }
1388: /* if the a->i are the same */
1389: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1390: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1392: /* if a->j are the same */
1393: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1394: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1396: /* if a->a are the same */
1397: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1398: PetscFunctionReturn(PETSC_SUCCESS);
1399: }
1401: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1402: {
1403: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1404: PetscInt i, j, k, row, bs, ambs, bs2;
1405: const PetscInt *ai, *aj;
1406: PetscScalar *x, zero = 0.0;
1407: const MatScalar *aa, *aa_j;
1409: PetscFunctionBegin;
1410: bs = A->rmap->bs;
1411: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1413: aa = a->a;
1414: ambs = a->mbs;
1416: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1417: PetscInt *diag = a->diag;
1418: aa = a->a;
1419: ambs = a->mbs;
1420: PetscCall(VecGetArray(v, &x));
1421: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1422: PetscCall(VecRestoreArray(v, &x));
1423: PetscFunctionReturn(PETSC_SUCCESS);
1424: }
1426: ai = a->i;
1427: aj = a->j;
1428: bs2 = a->bs2;
1429: PetscCall(VecSet(v, zero));
1430: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1431: PetscCall(VecGetArray(v, &x));
1432: for (i = 0; i < ambs; i++) {
1433: j = ai[i];
1434: if (aj[j] == i) { /* if this is a diagonal element */
1435: row = i * bs;
1436: aa_j = aa + j * bs2;
1437: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1438: }
1439: }
1440: PetscCall(VecRestoreArray(v, &x));
1441: PetscFunctionReturn(PETSC_SUCCESS);
1442: }
1444: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1445: {
1446: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1447: PetscScalar x;
1448: const PetscScalar *l, *li, *ri;
1449: MatScalar *aa, *v;
1450: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1451: const PetscInt *ai, *aj;
1452: PetscBool flg;
1454: PetscFunctionBegin;
1455: if (ll != rr) {
1456: PetscCall(VecEqual(ll, rr, &flg));
1457: PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1458: }
1459: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1460: ai = a->i;
1461: aj = a->j;
1462: aa = a->a;
1463: m = A->rmap->N;
1464: bs = A->rmap->bs;
1465: mbs = a->mbs;
1466: bs2 = a->bs2;
1468: PetscCall(VecGetArrayRead(ll, &l));
1469: PetscCall(VecGetLocalSize(ll, &lm));
1470: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1471: for (i = 0; i < mbs; i++) { /* for each block row */
1472: M = ai[i + 1] - ai[i];
1473: li = l + i * bs;
1474: v = aa + bs2 * ai[i];
1475: for (j = 0; j < M; j++) { /* for each block */
1476: ri = l + bs * aj[ai[i] + j];
1477: for (k = 0; k < bs; k++) {
1478: x = ri[k];
1479: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1480: }
1481: }
1482: }
1483: PetscCall(VecRestoreArrayRead(ll, &l));
1484: PetscCall(PetscLogFlops(2.0 * a->nz));
1485: PetscFunctionReturn(PETSC_SUCCESS);
1486: }
1488: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1489: {
1490: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1492: PetscFunctionBegin;
1493: info->block_size = a->bs2;
1494: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1495: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1496: info->nz_unneeded = info->nz_allocated - info->nz_used;
1497: info->assemblies = A->num_ass;
1498: info->mallocs = A->info.mallocs;
1499: info->memory = 0; /* REVIEW ME */
1500: if (A->factortype) {
1501: info->fill_ratio_given = A->info.fill_ratio_given;
1502: info->fill_ratio_needed = A->info.fill_ratio_needed;
1503: info->factor_mallocs = A->info.factor_mallocs;
1504: } else {
1505: info->fill_ratio_given = 0;
1506: info->fill_ratio_needed = 0;
1507: info->factor_mallocs = 0;
1508: }
1509: PetscFunctionReturn(PETSC_SUCCESS);
1510: }
1512: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1513: {
1514: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1516: PetscFunctionBegin;
1517: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1518: PetscFunctionReturn(PETSC_SUCCESS);
1519: }
1521: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1522: {
1523: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1524: PetscInt i, j, n, row, col, bs, mbs;
1525: const PetscInt *ai, *aj;
1526: PetscReal atmp;
1527: const MatScalar *aa;
1528: PetscScalar *x;
1529: PetscInt ncols, brow, bcol, krow, kcol;
1531: PetscFunctionBegin;
1532: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1533: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1534: bs = A->rmap->bs;
1535: aa = a->a;
1536: ai = a->i;
1537: aj = a->j;
1538: mbs = a->mbs;
1540: PetscCall(VecSet(v, 0.0));
1541: PetscCall(VecGetArray(v, &x));
1542: PetscCall(VecGetLocalSize(v, &n));
1543: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1544: for (i = 0; i < mbs; i++) {
1545: ncols = ai[1] - ai[0];
1546: ai++;
1547: brow = bs * i;
1548: for (j = 0; j < ncols; j++) {
1549: bcol = bs * (*aj);
1550: for (kcol = 0; kcol < bs; kcol++) {
1551: col = bcol + kcol; /* col index */
1552: for (krow = 0; krow < bs; krow++) {
1553: atmp = PetscAbsScalar(*aa);
1554: aa++;
1555: row = brow + krow; /* row index */
1556: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1557: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1558: }
1559: }
1560: aj++;
1561: }
1562: }
1563: PetscCall(VecRestoreArray(v, &x));
1564: PetscFunctionReturn(PETSC_SUCCESS);
1565: }
1567: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1568: {
1569: PetscFunctionBegin;
1570: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1571: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1572: PetscFunctionReturn(PETSC_SUCCESS);
1573: }
1575: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1576: {
1577: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1578: PetscScalar *z = c;
1579: const PetscScalar *xb;
1580: PetscScalar x1;
1581: const MatScalar *v = a->a, *vv;
1582: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1583: #if defined(PETSC_USE_COMPLEX)
1584: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1585: #else
1586: const int aconj = 0;
1587: #endif
1589: PetscFunctionBegin;
1590: for (i = 0; i < mbs; i++) {
1591: n = ii[1] - ii[0];
1592: ii++;
1593: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1594: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1595: jj = idx;
1596: vv = v;
1597: for (k = 0; k < cn; k++) {
1598: idx = jj;
1599: v = vv;
1600: for (j = 0; j < n; j++) {
1601: xb = b + (*idx);
1602: x1 = xb[0 + k * bm];
1603: z[0 + k * cm] += v[0] * x1;
1604: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1605: v += 1;
1606: ++idx;
1607: }
1608: }
1609: z += 1;
1610: }
1611: PetscFunctionReturn(PETSC_SUCCESS);
1612: }
1614: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1615: {
1616: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1617: PetscScalar *z = c;
1618: const PetscScalar *xb;
1619: PetscScalar x1, x2;
1620: const MatScalar *v = a->a, *vv;
1621: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1623: PetscFunctionBegin;
1624: for (i = 0; i < mbs; i++) {
1625: n = ii[1] - ii[0];
1626: ii++;
1627: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1628: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1629: jj = idx;
1630: vv = v;
1631: for (k = 0; k < cn; k++) {
1632: idx = jj;
1633: v = vv;
1634: for (j = 0; j < n; j++) {
1635: xb = b + 2 * (*idx);
1636: x1 = xb[0 + k * bm];
1637: x2 = xb[1 + k * bm];
1638: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1639: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1640: if (*idx != i) {
1641: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1642: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1643: }
1644: v += 4;
1645: ++idx;
1646: }
1647: }
1648: z += 2;
1649: }
1650: PetscFunctionReturn(PETSC_SUCCESS);
1651: }
1653: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1654: {
1655: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1656: PetscScalar *z = c;
1657: const PetscScalar *xb;
1658: PetscScalar x1, x2, x3;
1659: const MatScalar *v = a->a, *vv;
1660: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1662: PetscFunctionBegin;
1663: for (i = 0; i < mbs; i++) {
1664: n = ii[1] - ii[0];
1665: ii++;
1666: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1667: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1668: jj = idx;
1669: vv = v;
1670: for (k = 0; k < cn; k++) {
1671: idx = jj;
1672: v = vv;
1673: for (j = 0; j < n; j++) {
1674: xb = b + 3 * (*idx);
1675: x1 = xb[0 + k * bm];
1676: x2 = xb[1 + k * bm];
1677: x3 = xb[2 + k * bm];
1678: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1679: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1680: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1681: if (*idx != i) {
1682: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1683: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1684: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1685: }
1686: v += 9;
1687: ++idx;
1688: }
1689: }
1690: z += 3;
1691: }
1692: PetscFunctionReturn(PETSC_SUCCESS);
1693: }
1695: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1696: {
1697: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1698: PetscScalar *z = c;
1699: const PetscScalar *xb;
1700: PetscScalar x1, x2, x3, x4;
1701: const MatScalar *v = a->a, *vv;
1702: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1704: PetscFunctionBegin;
1705: for (i = 0; i < mbs; i++) {
1706: n = ii[1] - ii[0];
1707: ii++;
1708: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1709: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1710: jj = idx;
1711: vv = v;
1712: for (k = 0; k < cn; k++) {
1713: idx = jj;
1714: v = vv;
1715: for (j = 0; j < n; j++) {
1716: xb = b + 4 * (*idx);
1717: x1 = xb[0 + k * bm];
1718: x2 = xb[1 + k * bm];
1719: x3 = xb[2 + k * bm];
1720: x4 = xb[3 + k * bm];
1721: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1722: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1723: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1724: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1725: if (*idx != i) {
1726: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1727: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1728: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1729: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1730: }
1731: v += 16;
1732: ++idx;
1733: }
1734: }
1735: z += 4;
1736: }
1737: PetscFunctionReturn(PETSC_SUCCESS);
1738: }
1740: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1741: {
1742: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1743: PetscScalar *z = c;
1744: const PetscScalar *xb;
1745: PetscScalar x1, x2, x3, x4, x5;
1746: const MatScalar *v = a->a, *vv;
1747: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1749: PetscFunctionBegin;
1750: for (i = 0; i < mbs; i++) {
1751: n = ii[1] - ii[0];
1752: ii++;
1753: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1754: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1755: jj = idx;
1756: vv = v;
1757: for (k = 0; k < cn; k++) {
1758: idx = jj;
1759: v = vv;
1760: for (j = 0; j < n; j++) {
1761: xb = b + 5 * (*idx);
1762: x1 = xb[0 + k * bm];
1763: x2 = xb[1 + k * bm];
1764: x3 = xb[2 + k * bm];
1765: x4 = xb[3 + k * bm];
1766: x5 = xb[4 + k * cm];
1767: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1768: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1769: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1770: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1771: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1772: if (*idx != i) {
1773: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1774: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1775: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1776: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1777: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1778: }
1779: v += 25;
1780: ++idx;
1781: }
1782: }
1783: z += 5;
1784: }
1785: PetscFunctionReturn(PETSC_SUCCESS);
1786: }
1788: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1789: {
1790: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1791: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1792: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1793: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1794: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1795: PetscBLASInt bbs, bcn, bbm, bcm;
1796: PetscScalar *z = NULL;
1797: PetscScalar *c, *b;
1798: const MatScalar *v;
1799: const PetscInt *idx, *ii;
1800: PetscScalar _DOne = 1.0;
1802: PetscFunctionBegin;
1803: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1804: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1805: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1806: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1807: b = bd->v;
1808: PetscCall(MatZeroEntries(C));
1809: PetscCall(MatDenseGetArray(C, &c));
1810: switch (bs) {
1811: case 1:
1812: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1813: break;
1814: case 2:
1815: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1816: break;
1817: case 3:
1818: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1819: break;
1820: case 4:
1821: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1822: break;
1823: case 5:
1824: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1825: break;
1826: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1827: PetscCall(PetscBLASIntCast(bs, &bbs));
1828: PetscCall(PetscBLASIntCast(cn, &bcn));
1829: PetscCall(PetscBLASIntCast(bm, &bbm));
1830: PetscCall(PetscBLASIntCast(cm, &bcm));
1831: idx = a->j;
1832: v = a->a;
1833: mbs = a->mbs;
1834: ii = a->i;
1835: z = c;
1836: for (i = 0; i < mbs; i++) {
1837: n = ii[1] - ii[0];
1838: ii++;
1839: for (j = 0; j < n; j++) {
1840: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1841: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1842: v += bs2;
1843: }
1844: z += bs;
1845: }
1846: }
1847: PetscCall(MatDenseRestoreArray(C, &c));
1848: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1849: PetscFunctionReturn(PETSC_SUCCESS);
1850: }