Actual source code: sbaij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  4: #include <petsc/private/kernels/blockinvert.h>
  5: #include <petscbt.h>
  6: #include <petscblaslapack.h>

  8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
  9: {
 10:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 11:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
 12:   const PetscInt *idx;
 13:   PetscBT         table_out, table_in;

 15:   PetscFunctionBegin;
 16:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 17:   mbs = a->mbs;
 18:   ai  = a->i;
 19:   aj  = a->j;
 20:   bs  = A->rmap->bs;
 21:   PetscCall(PetscBTCreate(mbs, &table_out));
 22:   PetscCall(PetscMalloc1(mbs + 1, &nidx));
 23:   PetscCall(PetscBTCreate(mbs, &table_in));

 25:   for (i = 0; i < is_max; i++) { /* for each is */
 26:     isz = 0;
 27:     PetscCall(PetscBTMemzero(mbs, table_out));

 29:     /* Extract the indices, assume there can be duplicate entries */
 30:     PetscCall(ISGetIndices(is[i], &idx));
 31:     PetscCall(ISGetLocalSize(is[i], &n));

 33:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 34:     bcol_max = 0;
 35:     for (j = 0; j < n; ++j) {
 36:       brow = idx[j] / bs; /* convert the indices into block indices */
 37:       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 38:       if (!PetscBTLookupSet(table_out, brow)) {
 39:         nidx[isz++] = brow;
 40:         if (bcol_max < brow) bcol_max = brow;
 41:       }
 42:     }
 43:     PetscCall(ISRestoreIndices(is[i], &idx));
 44:     PetscCall(ISDestroy(&is[i]));

 46:     k = 0;
 47:     for (j = 0; j < ov; j++) { /* for each overlap */
 48:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 49:       PetscCall(PetscBTMemzero(mbs, table_in));
 50:       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));

 52:       n = isz; /* length of the updated is[i] */
 53:       for (brow = 0; brow < mbs; brow++) {
 54:         start = ai[brow];
 55:         end   = ai[brow + 1];
 56:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 57:           for (l = start; l < end; l++) {
 58:             bcol = aj[l];
 59:             if (!PetscBTLookupSet(table_out, bcol)) {
 60:               nidx[isz++] = bcol;
 61:               if (bcol_max < bcol) bcol_max = bcol;
 62:             }
 63:           }
 64:           k++;
 65:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 66:         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
 67:           for (l = start; l < end; l++) {
 68:             bcol = aj[l];
 69:             if (bcol > bcol_max) break;
 70:             if (PetscBTLookup(table_in, bcol)) {
 71:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 72:               break; /* for l = start; l<end ; l++) */
 73:             }
 74:           }
 75:         }
 76:       }
 77:     } /* for each overlap */
 78:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 79:   } /* for each is */
 80:   PetscCall(PetscBTDestroy(&table_out));
 81:   PetscCall(PetscFree(nidx));
 82:   PetscCall(PetscBTDestroy(&table_in));
 83:   PetscFunctionReturn(PETSC_SUCCESS);
 84: }

 86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 87:         Zero some ops' to avoid invalid use */
 88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 89: {
 90:   PetscFunctionBegin;
 91:   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
 92:   Bseq->ops->mult                   = NULL;
 93:   Bseq->ops->multadd                = NULL;
 94:   Bseq->ops->multtranspose          = NULL;
 95:   Bseq->ops->multtransposeadd       = NULL;
 96:   Bseq->ops->lufactor               = NULL;
 97:   Bseq->ops->choleskyfactor         = NULL;
 98:   Bseq->ops->lufactorsymbolic       = NULL;
 99:   Bseq->ops->choleskyfactorsymbolic = NULL;
100:   Bseq->ops->getinertia             = NULL;
101:   PetscFunctionReturn(PETSC_SUCCESS);
102: }

104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108:   Mat_SeqBAIJ    *d = NULL;
109:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111:   const PetscInt *irow, *icol;
112:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113:   PetscInt       *aj = a->j, *ai = a->i;
114:   MatScalar      *mat_a;
115:   Mat             C;
116:   PetscBool       flag;

118:   PetscFunctionBegin;

120:   PetscCall(ISGetIndices(isrow, &irow));
121:   PetscCall(ISGetIndices(iscol, &icol));
122:   PetscCall(ISGetLocalSize(isrow, &nrows));
123:   PetscCall(ISGetLocalSize(iscol, &ncols));

125:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126:   ssmap = smap;
127:   PetscCall(PetscMalloc1(1 + nrows, &lens));
128:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129:   /* determine lens of each row */
130:   for (i = 0; i < nrows; i++) {
131:     kstart  = ai[irow[i]];
132:     kend    = kstart + a->ilen[irow[i]];
133:     lens[i] = 0;
134:     for (k = kstart; k < kend; k++) {
135:       if (ssmap[aj[k]]) lens[i]++;
136:     }
137:   }
138:   /* Create and fill new matrix */
139:   if (scall == MAT_REUSE_MATRIX) {
140:     if (sym) {
141:       c = (Mat_SeqSBAIJ *)((*B)->data);

143:       PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
144:       PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
145:       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
146:       PetscCall(PetscArrayzero(c->ilen, c->mbs));
147:     } else {
148:       d = (Mat_SeqBAIJ *)((*B)->data);

150:       PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
151:       PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
152:       PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
153:       PetscCall(PetscArrayzero(d->ilen, d->mbs));
154:     }
155:     C = *B;
156:   } else {
157:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
158:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
159:     if (sym) {
160:       PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
161:       PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
162:     } else {
163:       PetscCall(MatSetType(C, MATSEQBAIJ));
164:       PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
165:     }
166:   }
167:   if (sym) c = (Mat_SeqSBAIJ *)(C->data);
168:   else d = (Mat_SeqBAIJ *)(C->data);
169:   for (i = 0; i < nrows; i++) {
170:     row    = irow[i];
171:     kstart = ai[row];
172:     kend   = kstart + a->ilen[row];
173:     if (sym) {
174:       mat_i    = c->i[i];
175:       mat_j    = c->j + mat_i;
176:       mat_a    = c->a + mat_i * bs2;
177:       mat_ilen = c->ilen + i;
178:     } else {
179:       mat_i    = d->i[i];
180:       mat_j    = d->j + mat_i;
181:       mat_a    = d->a + mat_i * bs2;
182:       mat_ilen = d->ilen + i;
183:     }
184:     for (k = kstart; k < kend; k++) {
185:       if ((tcol = ssmap[a->j[k]])) {
186:         *mat_j++ = tcol - 1;
187:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
188:         mat_a += bs2;
189:         (*mat_ilen)++;
190:       }
191:     }
192:   }
193:   /* sort */
194:   {
195:     MatScalar *work;

197:     PetscCall(PetscMalloc1(bs2, &work));
198:     for (i = 0; i < nrows; i++) {
199:       PetscInt ilen;
200:       if (sym) {
201:         mat_i = c->i[i];
202:         mat_j = c->j + mat_i;
203:         mat_a = c->a + mat_i * bs2;
204:         ilen  = c->ilen[i];
205:       } else {
206:         mat_i = d->i[i];
207:         mat_j = d->j + mat_i;
208:         mat_a = d->a + mat_i * bs2;
209:         ilen  = d->ilen[i];
210:       }
211:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
212:     }
213:     PetscCall(PetscFree(work));
214:   }

216:   /* Free work space */
217:   PetscCall(ISRestoreIndices(iscol, &icol));
218:   PetscCall(PetscFree(smap));
219:   PetscCall(PetscFree(lens));
220:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
221:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

223:   PetscCall(ISRestoreIndices(isrow, &irow));
224:   *B = C;
225:   PetscFunctionReturn(PETSC_SUCCESS);
226: }

228: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
229: {
230:   Mat       C[2];
231:   IS        is1, is2, intersect = NULL;
232:   PetscInt  n1, n2, ni;
233:   PetscBool sym = PETSC_TRUE;

235:   PetscFunctionBegin;
236:   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
237:   if (isrow == iscol) {
238:     is2 = is1;
239:     PetscCall(PetscObjectReference((PetscObject)is2));
240:   } else {
241:     PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
242:     PetscCall(ISIntersect(is1, is2, &intersect));
243:     PetscCall(ISGetLocalSize(intersect, &ni));
244:     PetscCall(ISDestroy(&intersect));
245:     if (ni == 0) sym = PETSC_FALSE;
246:     else if (PetscDefined(USE_DEBUG)) {
247:       PetscCall(ISGetLocalSize(is1, &n1));
248:       PetscCall(ISGetLocalSize(is2, &n2));
249:       PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
250:     }
251:   }
252:   if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
253:   else {
254:     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
255:     PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
256:     PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
257:     PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
258:     PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
259:     if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
260:     else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
261:     else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
262:     PetscCall(MatDestroy(C));
263:     PetscCall(MatDestroy(C + 1));
264:   }
265:   PetscCall(ISDestroy(&is1));
266:   PetscCall(ISDestroy(&is2));

268:   if (sym && isrow != iscol) {
269:     PetscBool isequal;
270:     PetscCall(ISEqual(isrow, iscol, &isequal));
271:     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
272:   }
273:   PetscFunctionReturn(PETSC_SUCCESS);
274: }

276: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
277: {
278:   PetscInt i;

280:   PetscFunctionBegin;
281:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));

283:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
284:   PetscFunctionReturn(PETSC_SUCCESS);
285: }

287: /* Should check that shapes of vectors and matrices match */
288: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
289: {
290:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
291:   PetscScalar       *z, x1, x2, zero = 0.0;
292:   const PetscScalar *x, *xb;
293:   const MatScalar   *v;
294:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
295:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
296:   PetscInt           nonzerorow = 0;

298:   PetscFunctionBegin;
299:   PetscCall(VecSet(zz, zero));
300:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
301:   PetscCall(VecGetArrayRead(xx, &x));
302:   PetscCall(VecGetArray(zz, &z));

304:   v  = a->a;
305:   xb = x;

307:   for (i = 0; i < mbs; i++) {
308:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
309:     x1   = xb[0];
310:     x2   = xb[1];
311:     ib   = aj + *ai;
312:     jmin = 0;
313:     nonzerorow += (n > 0);
314:     if (*ib == i) { /* (diag of A)*x */
315:       z[2 * i] += v[0] * x1 + v[2] * x2;
316:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
317:       v += 4;
318:       jmin++;
319:     }
320:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
321:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
322:     for (j = jmin; j < n; j++) {
323:       /* (strict lower triangular part of A)*x  */
324:       cval = ib[j] * 2;
325:       z[cval] += v[0] * x1 + v[1] * x2;
326:       z[cval + 1] += v[2] * x1 + v[3] * x2;
327:       /* (strict upper triangular part of A)*x  */
328:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
329:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
330:       v += 4;
331:     }
332:     xb += 2;
333:     ai++;
334:   }

336:   PetscCall(VecRestoreArrayRead(xx, &x));
337:   PetscCall(VecRestoreArray(zz, &z));
338:   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
339:   PetscFunctionReturn(PETSC_SUCCESS);
340: }

342: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
343: {
344:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
345:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
346:   const PetscScalar *x, *xb;
347:   const MatScalar   *v;
348:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
349:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
350:   PetscInt           nonzerorow = 0;

352:   PetscFunctionBegin;
353:   PetscCall(VecSet(zz, zero));
354:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
355:   PetscCall(VecGetArrayRead(xx, &x));
356:   PetscCall(VecGetArray(zz, &z));

358:   v  = a->a;
359:   xb = x;

361:   for (i = 0; i < mbs; i++) {
362:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
363:     x1   = xb[0];
364:     x2   = xb[1];
365:     x3   = xb[2];
366:     ib   = aj + *ai;
367:     jmin = 0;
368:     nonzerorow += (n > 0);
369:     if (*ib == i) { /* (diag of A)*x */
370:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
371:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
372:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
373:       v += 9;
374:       jmin++;
375:     }
376:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
377:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378:     for (j = jmin; j < n; j++) {
379:       /* (strict lower triangular part of A)*x  */
380:       cval = ib[j] * 3;
381:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
382:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
383:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
384:       /* (strict upper triangular part of A)*x  */
385:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
386:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
387:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
388:       v += 9;
389:     }
390:     xb += 3;
391:     ai++;
392:   }

394:   PetscCall(VecRestoreArrayRead(xx, &x));
395:   PetscCall(VecRestoreArray(zz, &z));
396:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
397:   PetscFunctionReturn(PETSC_SUCCESS);
398: }

400: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
401: {
402:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
403:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
404:   const PetscScalar *x, *xb;
405:   const MatScalar   *v;
406:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
407:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
408:   PetscInt           nonzerorow = 0;

410:   PetscFunctionBegin;
411:   PetscCall(VecSet(zz, zero));
412:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
413:   PetscCall(VecGetArrayRead(xx, &x));
414:   PetscCall(VecGetArray(zz, &z));

416:   v  = a->a;
417:   xb = x;

419:   for (i = 0; i < mbs; i++) {
420:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
421:     x1   = xb[0];
422:     x2   = xb[1];
423:     x3   = xb[2];
424:     x4   = xb[3];
425:     ib   = aj + *ai;
426:     jmin = 0;
427:     nonzerorow += (n > 0);
428:     if (*ib == i) { /* (diag of A)*x */
429:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
430:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
431:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
432:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
433:       v += 16;
434:       jmin++;
435:     }
436:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
437:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
438:     for (j = jmin; j < n; j++) {
439:       /* (strict lower triangular part of A)*x  */
440:       cval = ib[j] * 4;
441:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
442:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
443:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
444:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
445:       /* (strict upper triangular part of A)*x  */
446:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
447:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
448:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
449:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
450:       v += 16;
451:     }
452:     xb += 4;
453:     ai++;
454:   }

456:   PetscCall(VecRestoreArrayRead(xx, &x));
457:   PetscCall(VecRestoreArray(zz, &z));
458:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
459:   PetscFunctionReturn(PETSC_SUCCESS);
460: }

462: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
463: {
464:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
465:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
466:   const PetscScalar *x, *xb;
467:   const MatScalar   *v;
468:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
469:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
470:   PetscInt           nonzerorow = 0;

472:   PetscFunctionBegin;
473:   PetscCall(VecSet(zz, zero));
474:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
475:   PetscCall(VecGetArrayRead(xx, &x));
476:   PetscCall(VecGetArray(zz, &z));

478:   v  = a->a;
479:   xb = x;

481:   for (i = 0; i < mbs; i++) {
482:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
483:     x1   = xb[0];
484:     x2   = xb[1];
485:     x3   = xb[2];
486:     x4   = xb[3];
487:     x5   = xb[4];
488:     ib   = aj + *ai;
489:     jmin = 0;
490:     nonzerorow += (n > 0);
491:     if (*ib == i) { /* (diag of A)*x */
492:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
493:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
494:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
495:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
496:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
497:       v += 25;
498:       jmin++;
499:     }
500:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
501:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502:     for (j = jmin; j < n; j++) {
503:       /* (strict lower triangular part of A)*x  */
504:       cval = ib[j] * 5;
505:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
506:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
507:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
508:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
509:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
510:       /* (strict upper triangular part of A)*x  */
511:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
512:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
513:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
514:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
515:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
516:       v += 25;
517:     }
518:     xb += 5;
519:     ai++;
520:   }

522:   PetscCall(VecRestoreArrayRead(xx, &x));
523:   PetscCall(VecRestoreArray(zz, &z));
524:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
525:   PetscFunctionReturn(PETSC_SUCCESS);
526: }

528: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
529: {
530:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
531:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
532:   const PetscScalar *x, *xb;
533:   const MatScalar   *v;
534:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
535:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
536:   PetscInt           nonzerorow = 0;

538:   PetscFunctionBegin;
539:   PetscCall(VecSet(zz, zero));
540:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
541:   PetscCall(VecGetArrayRead(xx, &x));
542:   PetscCall(VecGetArray(zz, &z));

544:   v  = a->a;
545:   xb = x;

547:   for (i = 0; i < mbs; i++) {
548:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
549:     x1   = xb[0];
550:     x2   = xb[1];
551:     x3   = xb[2];
552:     x4   = xb[3];
553:     x5   = xb[4];
554:     x6   = xb[5];
555:     ib   = aj + *ai;
556:     jmin = 0;
557:     nonzerorow += (n > 0);
558:     if (*ib == i) { /* (diag of A)*x */
559:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
560:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
561:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
562:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
563:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
564:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
565:       v += 36;
566:       jmin++;
567:     }
568:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
569:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
570:     for (j = jmin; j < n; j++) {
571:       /* (strict lower triangular part of A)*x  */
572:       cval = ib[j] * 6;
573:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
574:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
575:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
576:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
577:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
578:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
579:       /* (strict upper triangular part of A)*x  */
580:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
581:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
582:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
583:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
584:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
585:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
586:       v += 36;
587:     }
588:     xb += 6;
589:     ai++;
590:   }

592:   PetscCall(VecRestoreArrayRead(xx, &x));
593:   PetscCall(VecRestoreArray(zz, &z));
594:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
595:   PetscFunctionReturn(PETSC_SUCCESS);
596: }

598: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
599: {
600:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
601:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
602:   const PetscScalar *x, *xb;
603:   const MatScalar   *v;
604:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
605:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
606:   PetscInt           nonzerorow = 0;

608:   PetscFunctionBegin;
609:   PetscCall(VecSet(zz, zero));
610:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
611:   PetscCall(VecGetArrayRead(xx, &x));
612:   PetscCall(VecGetArray(zz, &z));

614:   v  = a->a;
615:   xb = x;

617:   for (i = 0; i < mbs; i++) {
618:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
619:     x1   = xb[0];
620:     x2   = xb[1];
621:     x3   = xb[2];
622:     x4   = xb[3];
623:     x5   = xb[4];
624:     x6   = xb[5];
625:     x7   = xb[6];
626:     ib   = aj + *ai;
627:     jmin = 0;
628:     nonzerorow += (n > 0);
629:     if (*ib == i) { /* (diag of A)*x */
630:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
631:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
632:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
633:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
634:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
635:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
636:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
637:       v += 49;
638:       jmin++;
639:     }
640:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
641:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
642:     for (j = jmin; j < n; j++) {
643:       /* (strict lower triangular part of A)*x  */
644:       cval = ib[j] * 7;
645:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
646:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
647:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
648:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
649:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
650:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
651:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
652:       /* (strict upper triangular part of A)*x  */
653:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
654:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
655:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
656:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
657:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
658:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
659:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
660:       v += 49;
661:     }
662:     xb += 7;
663:     ai++;
664:   }
665:   PetscCall(VecRestoreArrayRead(xx, &x));
666:   PetscCall(VecRestoreArray(zz, &z));
667:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
668:   PetscFunctionReturn(PETSC_SUCCESS);
669: }

671: /*
672:     This will not work with MatScalar == float because it calls the BLAS
673: */
674: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
675: {
676:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
677:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
678:   const PetscScalar *x, *x_ptr, *xb;
679:   const MatScalar   *v;
680:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
681:   const PetscInt    *idx, *aj, *ii;
682:   PetscInt           nonzerorow = 0;

684:   PetscFunctionBegin;
685:   PetscCall(VecSet(zz, zero));
686:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
687:   PetscCall(VecGetArrayRead(xx, &x));
688:   PetscCall(VecGetArray(zz, &z));

690:   x_ptr = x;
691:   z_ptr = z;

693:   aj = a->j;
694:   v  = a->a;
695:   ii = a->i;

697:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
698:   work = a->mult_work;

700:   for (i = 0; i < mbs; i++) {
701:     n     = ii[1] - ii[0];
702:     ncols = n * bs;
703:     workt = work;
704:     idx   = aj + ii[0];
705:     nonzerorow += (n > 0);

707:     /* upper triangular part */
708:     for (j = 0; j < n; j++) {
709:       xb = x_ptr + bs * (*idx++);
710:       for (k = 0; k < bs; k++) workt[k] = xb[k];
711:       workt += bs;
712:     }
713:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
714:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

716:     /* strict lower triangular part */
717:     idx = aj + ii[0];
718:     if (n && *idx == i) {
719:       ncols -= bs;
720:       v += bs2;
721:       idx++;
722:       n--;
723:     }

725:     if (ncols > 0) {
726:       workt = work;
727:       PetscCall(PetscArrayzero(workt, ncols));
728:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
729:       for (j = 0; j < n; j++) {
730:         zb = z_ptr + bs * (*idx++);
731:         for (k = 0; k < bs; k++) zb[k] += workt[k];
732:         workt += bs;
733:       }
734:     }
735:     x += bs;
736:     v += n * bs2;
737:     z += bs;
738:     ii++;
739:   }

741:   PetscCall(VecRestoreArrayRead(xx, &x));
742:   PetscCall(VecRestoreArray(zz, &z));
743:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
744:   PetscFunctionReturn(PETSC_SUCCESS);
745: }

747: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
748: {
749:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
750:   PetscScalar       *z, x1;
751:   const PetscScalar *x, *xb;
752:   const MatScalar   *v;
753:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
754:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
755:   PetscInt           nonzerorow = 0;
756: #if defined(PETSC_USE_COMPLEX)
757:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
758: #else
759:   const int aconj = 0;
760: #endif

762:   PetscFunctionBegin;
763:   PetscCall(VecCopy(yy, zz));
764:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
765:   PetscCall(VecGetArrayRead(xx, &x));
766:   PetscCall(VecGetArray(zz, &z));
767:   v  = a->a;
768:   xb = x;

770:   for (i = 0; i < mbs; i++) {
771:     n    = ai[1] - ai[0]; /* length of i_th row of A */
772:     x1   = xb[0];
773:     ib   = aj + *ai;
774:     jmin = 0;
775:     nonzerorow += (n > 0);
776:     if (n && *ib == i) { /* (diag of A)*x */
777:       z[i] += *v++ * x[*ib++];
778:       jmin++;
779:     }
780:     if (aconj) {
781:       for (j = jmin; j < n; j++) {
782:         cval = *ib;
783:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
784:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
785:       }
786:     } else {
787:       for (j = jmin; j < n; j++) {
788:         cval = *ib;
789:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
790:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
791:       }
792:     }
793:     xb++;
794:     ai++;
795:   }

797:   PetscCall(VecRestoreArrayRead(xx, &x));
798:   PetscCall(VecRestoreArray(zz, &z));

800:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
801:   PetscFunctionReturn(PETSC_SUCCESS);
802: }

804: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
805: {
806:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
807:   PetscScalar       *z, x1, x2;
808:   const PetscScalar *x, *xb;
809:   const MatScalar   *v;
810:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
811:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
812:   PetscInt           nonzerorow = 0;

814:   PetscFunctionBegin;
815:   PetscCall(VecCopy(yy, zz));
816:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
817:   PetscCall(VecGetArrayRead(xx, &x));
818:   PetscCall(VecGetArray(zz, &z));

820:   v  = a->a;
821:   xb = x;

823:   for (i = 0; i < mbs; i++) {
824:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
825:     x1   = xb[0];
826:     x2   = xb[1];
827:     ib   = aj + *ai;
828:     jmin = 0;
829:     nonzerorow += (n > 0);
830:     if (n && *ib == i) { /* (diag of A)*x */
831:       z[2 * i] += v[0] * x1 + v[2] * x2;
832:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
833:       v += 4;
834:       jmin++;
835:     }
836:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
837:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
838:     for (j = jmin; j < n; j++) {
839:       /* (strict lower triangular part of A)*x  */
840:       cval = ib[j] * 2;
841:       z[cval] += v[0] * x1 + v[1] * x2;
842:       z[cval + 1] += v[2] * x1 + v[3] * x2;
843:       /* (strict upper triangular part of A)*x  */
844:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
845:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
846:       v += 4;
847:     }
848:     xb += 2;
849:     ai++;
850:   }
851:   PetscCall(VecRestoreArrayRead(xx, &x));
852:   PetscCall(VecRestoreArray(zz, &z));

854:   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
855:   PetscFunctionReturn(PETSC_SUCCESS);
856: }

858: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
859: {
860:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
861:   PetscScalar       *z, x1, x2, x3;
862:   const PetscScalar *x, *xb;
863:   const MatScalar   *v;
864:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
865:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
866:   PetscInt           nonzerorow = 0;

868:   PetscFunctionBegin;
869:   PetscCall(VecCopy(yy, zz));
870:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
871:   PetscCall(VecGetArrayRead(xx, &x));
872:   PetscCall(VecGetArray(zz, &z));

874:   v  = a->a;
875:   xb = x;

877:   for (i = 0; i < mbs; i++) {
878:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
879:     x1   = xb[0];
880:     x2   = xb[1];
881:     x3   = xb[2];
882:     ib   = aj + *ai;
883:     jmin = 0;
884:     nonzerorow += (n > 0);
885:     if (n && *ib == i) { /* (diag of A)*x */
886:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
887:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
888:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
889:       v += 9;
890:       jmin++;
891:     }
892:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
893:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
894:     for (j = jmin; j < n; j++) {
895:       /* (strict lower triangular part of A)*x  */
896:       cval = ib[j] * 3;
897:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
898:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
899:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
900:       /* (strict upper triangular part of A)*x  */
901:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
902:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
903:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
904:       v += 9;
905:     }
906:     xb += 3;
907:     ai++;
908:   }

910:   PetscCall(VecRestoreArrayRead(xx, &x));
911:   PetscCall(VecRestoreArray(zz, &z));

913:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
914:   PetscFunctionReturn(PETSC_SUCCESS);
915: }

917: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
918: {
919:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
920:   PetscScalar       *z, x1, x2, x3, x4;
921:   const PetscScalar *x, *xb;
922:   const MatScalar   *v;
923:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
924:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
925:   PetscInt           nonzerorow = 0;

927:   PetscFunctionBegin;
928:   PetscCall(VecCopy(yy, zz));
929:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
930:   PetscCall(VecGetArrayRead(xx, &x));
931:   PetscCall(VecGetArray(zz, &z));

933:   v  = a->a;
934:   xb = x;

936:   for (i = 0; i < mbs; i++) {
937:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
938:     x1   = xb[0];
939:     x2   = xb[1];
940:     x3   = xb[2];
941:     x4   = xb[3];
942:     ib   = aj + *ai;
943:     jmin = 0;
944:     nonzerorow += (n > 0);
945:     if (n && *ib == i) { /* (diag of A)*x */
946:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
947:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
948:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
949:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
950:       v += 16;
951:       jmin++;
952:     }
953:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
954:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
955:     for (j = jmin; j < n; j++) {
956:       /* (strict lower triangular part of A)*x  */
957:       cval = ib[j] * 4;
958:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
959:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
960:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
961:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
962:       /* (strict upper triangular part of A)*x  */
963:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
964:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
965:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
966:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
967:       v += 16;
968:     }
969:     xb += 4;
970:     ai++;
971:   }

973:   PetscCall(VecRestoreArrayRead(xx, &x));
974:   PetscCall(VecRestoreArray(zz, &z));

976:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
977:   PetscFunctionReturn(PETSC_SUCCESS);
978: }

980: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
981: {
982:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
983:   PetscScalar       *z, x1, x2, x3, x4, x5;
984:   const PetscScalar *x, *xb;
985:   const MatScalar   *v;
986:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
987:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
988:   PetscInt           nonzerorow = 0;

990:   PetscFunctionBegin;
991:   PetscCall(VecCopy(yy, zz));
992:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
993:   PetscCall(VecGetArrayRead(xx, &x));
994:   PetscCall(VecGetArray(zz, &z));

996:   v  = a->a;
997:   xb = x;

999:   for (i = 0; i < mbs; i++) {
1000:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1001:     x1   = xb[0];
1002:     x2   = xb[1];
1003:     x3   = xb[2];
1004:     x4   = xb[3];
1005:     x5   = xb[4];
1006:     ib   = aj + *ai;
1007:     jmin = 0;
1008:     nonzerorow += (n > 0);
1009:     if (n && *ib == i) { /* (diag of A)*x */
1010:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1011:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1012:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1013:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1014:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1015:       v += 25;
1016:       jmin++;
1017:     }
1018:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1019:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1020:     for (j = jmin; j < n; j++) {
1021:       /* (strict lower triangular part of A)*x  */
1022:       cval = ib[j] * 5;
1023:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1024:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1025:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1026:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1027:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1028:       /* (strict upper triangular part of A)*x  */
1029:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1030:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1031:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1032:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1033:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1034:       v += 25;
1035:     }
1036:     xb += 5;
1037:     ai++;
1038:   }

1040:   PetscCall(VecRestoreArrayRead(xx, &x));
1041:   PetscCall(VecRestoreArray(zz, &z));

1043:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1044:   PetscFunctionReturn(PETSC_SUCCESS);
1045: }

1047: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1048: {
1049:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1050:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1051:   const PetscScalar *x, *xb;
1052:   const MatScalar   *v;
1053:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1054:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1055:   PetscInt           nonzerorow = 0;

1057:   PetscFunctionBegin;
1058:   PetscCall(VecCopy(yy, zz));
1059:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1060:   PetscCall(VecGetArrayRead(xx, &x));
1061:   PetscCall(VecGetArray(zz, &z));

1063:   v  = a->a;
1064:   xb = x;

1066:   for (i = 0; i < mbs; i++) {
1067:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1068:     x1   = xb[0];
1069:     x2   = xb[1];
1070:     x3   = xb[2];
1071:     x4   = xb[3];
1072:     x5   = xb[4];
1073:     x6   = xb[5];
1074:     ib   = aj + *ai;
1075:     jmin = 0;
1076:     nonzerorow += (n > 0);
1077:     if (n && *ib == i) { /* (diag of A)*x */
1078:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1079:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1080:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1081:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1082:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1083:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1084:       v += 36;
1085:       jmin++;
1086:     }
1087:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1088:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1089:     for (j = jmin; j < n; j++) {
1090:       /* (strict lower triangular part of A)*x  */
1091:       cval = ib[j] * 6;
1092:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1093:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1094:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1095:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1096:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1097:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1098:       /* (strict upper triangular part of A)*x  */
1099:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1100:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1101:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1102:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1103:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1104:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1105:       v += 36;
1106:     }
1107:     xb += 6;
1108:     ai++;
1109:   }

1111:   PetscCall(VecRestoreArrayRead(xx, &x));
1112:   PetscCall(VecRestoreArray(zz, &z));

1114:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1115:   PetscFunctionReturn(PETSC_SUCCESS);
1116: }

1118: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1119: {
1120:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1121:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1122:   const PetscScalar *x, *xb;
1123:   const MatScalar   *v;
1124:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1125:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1126:   PetscInt           nonzerorow = 0;

1128:   PetscFunctionBegin;
1129:   PetscCall(VecCopy(yy, zz));
1130:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1131:   PetscCall(VecGetArrayRead(xx, &x));
1132:   PetscCall(VecGetArray(zz, &z));

1134:   v  = a->a;
1135:   xb = x;

1137:   for (i = 0; i < mbs; i++) {
1138:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1139:     x1   = xb[0];
1140:     x2   = xb[1];
1141:     x3   = xb[2];
1142:     x4   = xb[3];
1143:     x5   = xb[4];
1144:     x6   = xb[5];
1145:     x7   = xb[6];
1146:     ib   = aj + *ai;
1147:     jmin = 0;
1148:     nonzerorow += (n > 0);
1149:     if (n && *ib == i) { /* (diag of A)*x */
1150:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1151:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1152:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1153:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1154:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1155:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1156:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1157:       v += 49;
1158:       jmin++;
1159:     }
1160:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1161:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1162:     for (j = jmin; j < n; j++) {
1163:       /* (strict lower triangular part of A)*x  */
1164:       cval = ib[j] * 7;
1165:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1166:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1167:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1168:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1169:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1170:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1171:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1172:       /* (strict upper triangular part of A)*x  */
1173:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1174:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1175:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1176:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1177:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1178:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1179:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1180:       v += 49;
1181:     }
1182:     xb += 7;
1183:     ai++;
1184:   }

1186:   PetscCall(VecRestoreArrayRead(xx, &x));
1187:   PetscCall(VecRestoreArray(zz, &z));

1189:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1190:   PetscFunctionReturn(PETSC_SUCCESS);
1191: }

1193: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1194: {
1195:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1196:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1197:   const PetscScalar *x, *x_ptr, *xb;
1198:   const MatScalar   *v;
1199:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1200:   const PetscInt    *idx, *aj, *ii;
1201:   PetscInt           nonzerorow = 0;

1203:   PetscFunctionBegin;
1204:   PetscCall(VecCopy(yy, zz));
1205:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1206:   PetscCall(VecGetArrayRead(xx, &x));
1207:   x_ptr = x;
1208:   PetscCall(VecGetArray(zz, &z));
1209:   z_ptr = z;

1211:   aj = a->j;
1212:   v  = a->a;
1213:   ii = a->i;

1215:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1216:   work = a->mult_work;

1218:   for (i = 0; i < mbs; i++) {
1219:     n     = ii[1] - ii[0];
1220:     ncols = n * bs;
1221:     workt = work;
1222:     idx   = aj + ii[0];
1223:     nonzerorow += (n > 0);

1225:     /* upper triangular part */
1226:     for (j = 0; j < n; j++) {
1227:       xb = x_ptr + bs * (*idx++);
1228:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1229:       workt += bs;
1230:     }
1231:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1232:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1234:     /* strict lower triangular part */
1235:     idx = aj + ii[0];
1236:     if (n && *idx == i) {
1237:       ncols -= bs;
1238:       v += bs2;
1239:       idx++;
1240:       n--;
1241:     }
1242:     if (ncols > 0) {
1243:       workt = work;
1244:       PetscCall(PetscArrayzero(workt, ncols));
1245:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1246:       for (j = 0; j < n; j++) {
1247:         zb = z_ptr + bs * (*idx++);
1248:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1249:         workt += bs;
1250:       }
1251:     }

1253:     x += bs;
1254:     v += n * bs2;
1255:     z += bs;
1256:     ii++;
1257:   }

1259:   PetscCall(VecRestoreArrayRead(xx, &x));
1260:   PetscCall(VecRestoreArray(zz, &z));

1262:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1263:   PetscFunctionReturn(PETSC_SUCCESS);
1264: }

1266: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1267: {
1268:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1269:   PetscScalar   oalpha = alpha;
1270:   PetscBLASInt  one    = 1, totalnz;

1272:   PetscFunctionBegin;
1273:   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1274:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1275:   PetscCall(PetscLogFlops(totalnz));
1276:   PetscFunctionReturn(PETSC_SUCCESS);
1277: }

1279: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1280: {
1281:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1282:   const MatScalar *v        = a->a;
1283:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1284:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1285:   const PetscInt  *aj = a->j, *col;

1287:   PetscFunctionBegin;
1288:   if (!a->nz) {
1289:     *norm = 0.0;
1290:     PetscFunctionReturn(PETSC_SUCCESS);
1291:   }
1292:   if (type == NORM_FROBENIUS) {
1293:     for (k = 0; k < mbs; k++) {
1294:       jmin = a->i[k];
1295:       jmax = a->i[k + 1];
1296:       col  = aj + jmin;
1297:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1298:         for (i = 0; i < bs2; i++) {
1299:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1300:           v++;
1301:         }
1302:         jmin++;
1303:       }
1304:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1305:         for (i = 0; i < bs2; i++) {
1306:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1307:           v++;
1308:         }
1309:       }
1310:     }
1311:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1312:     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1313:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1314:     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1315:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1316:     il[0] = 0;

1318:     *norm = 0.0;
1319:     for (k = 0; k < mbs; k++) { /* k_th block row */
1320:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1321:       /*-- col sum --*/
1322:       i = jl[k]; /* first |A(i,k)| to be added */
1323:       /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1324:                   at step k */
1325:       while (i < mbs) {
1326:         nexti = jl[i]; /* next block row to be added */
1327:         ik    = il[i]; /* block index of A(i,k) in the array a */
1328:         for (j = 0; j < bs; j++) {
1329:           v = a->a + ik * bs2 + j * bs;
1330:           for (k1 = 0; k1 < bs; k1++) {
1331:             sum[j] += PetscAbsScalar(*v);
1332:             v++;
1333:           }
1334:         }
1335:         /* update il, jl */
1336:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1337:         jmax = a->i[i + 1];
1338:         if (jmin < jmax) {
1339:           il[i] = jmin;
1340:           j     = a->j[jmin];
1341:           jl[i] = jl[j];
1342:           jl[j] = i;
1343:         }
1344:         i = nexti;
1345:       }
1346:       /*-- row sum --*/
1347:       jmin = a->i[k];
1348:       jmax = a->i[k + 1];
1349:       for (i = jmin; i < jmax; i++) {
1350:         for (j = 0; j < bs; j++) {
1351:           v = a->a + i * bs2 + j;
1352:           for (k1 = 0; k1 < bs; k1++) {
1353:             sum[j] += PetscAbsScalar(*v);
1354:             v += bs;
1355:           }
1356:         }
1357:       }
1358:       /* add k_th block row to il, jl */
1359:       col = aj + jmin;
1360:       if (jmax - jmin > 0 && *col == k) jmin++;
1361:       if (jmin < jmax) {
1362:         il[k] = jmin;
1363:         j     = a->j[jmin];
1364:         jl[k] = jl[j];
1365:         jl[j] = k;
1366:       }
1367:       for (j = 0; j < bs; j++) {
1368:         if (sum[j] > *norm) *norm = sum[j];
1369:       }
1370:     }
1371:     PetscCall(PetscFree3(sum, il, jl));
1372:     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1373:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1374:   PetscFunctionReturn(PETSC_SUCCESS);
1375: }

1377: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1378: {
1379:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1381:   PetscFunctionBegin;
1382:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1383:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1384:     *flg = PETSC_FALSE;
1385:     PetscFunctionReturn(PETSC_SUCCESS);
1386:   }

1388:   /* if the a->i are the same */
1389:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1390:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1392:   /* if a->j are the same */
1393:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1394:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1396:   /* if a->a are the same */
1397:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1398:   PetscFunctionReturn(PETSC_SUCCESS);
1399: }

1401: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1402: {
1403:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1404:   PetscInt         i, j, k, row, bs, ambs, bs2;
1405:   const PetscInt  *ai, *aj;
1406:   PetscScalar     *x, zero = 0.0;
1407:   const MatScalar *aa, *aa_j;

1409:   PetscFunctionBegin;
1410:   bs = A->rmap->bs;
1411:   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");

1413:   aa   = a->a;
1414:   ambs = a->mbs;

1416:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1417:     PetscInt *diag = a->diag;
1418:     aa             = a->a;
1419:     ambs           = a->mbs;
1420:     PetscCall(VecGetArray(v, &x));
1421:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1422:     PetscCall(VecRestoreArray(v, &x));
1423:     PetscFunctionReturn(PETSC_SUCCESS);
1424:   }

1426:   ai  = a->i;
1427:   aj  = a->j;
1428:   bs2 = a->bs2;
1429:   PetscCall(VecSet(v, zero));
1430:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1431:   PetscCall(VecGetArray(v, &x));
1432:   for (i = 0; i < ambs; i++) {
1433:     j = ai[i];
1434:     if (aj[j] == i) { /* if this is a diagonal element */
1435:       row  = i * bs;
1436:       aa_j = aa + j * bs2;
1437:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1438:     }
1439:   }
1440:   PetscCall(VecRestoreArray(v, &x));
1441:   PetscFunctionReturn(PETSC_SUCCESS);
1442: }

1444: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1445: {
1446:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1447:   PetscScalar        x;
1448:   const PetscScalar *l, *li, *ri;
1449:   MatScalar         *aa, *v;
1450:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1451:   const PetscInt    *ai, *aj;
1452:   PetscBool          flg;

1454:   PetscFunctionBegin;
1455:   if (ll != rr) {
1456:     PetscCall(VecEqual(ll, rr, &flg));
1457:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1458:   }
1459:   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1460:   ai  = a->i;
1461:   aj  = a->j;
1462:   aa  = a->a;
1463:   m   = A->rmap->N;
1464:   bs  = A->rmap->bs;
1465:   mbs = a->mbs;
1466:   bs2 = a->bs2;

1468:   PetscCall(VecGetArrayRead(ll, &l));
1469:   PetscCall(VecGetLocalSize(ll, &lm));
1470:   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1471:   for (i = 0; i < mbs; i++) { /* for each block row */
1472:     M  = ai[i + 1] - ai[i];
1473:     li = l + i * bs;
1474:     v  = aa + bs2 * ai[i];
1475:     for (j = 0; j < M; j++) { /* for each block */
1476:       ri = l + bs * aj[ai[i] + j];
1477:       for (k = 0; k < bs; k++) {
1478:         x = ri[k];
1479:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1480:       }
1481:     }
1482:   }
1483:   PetscCall(VecRestoreArrayRead(ll, &l));
1484:   PetscCall(PetscLogFlops(2.0 * a->nz));
1485:   PetscFunctionReturn(PETSC_SUCCESS);
1486: }

1488: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1489: {
1490:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1492:   PetscFunctionBegin;
1493:   info->block_size   = a->bs2;
1494:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1495:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1496:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1497:   info->assemblies   = A->num_ass;
1498:   info->mallocs      = A->info.mallocs;
1499:   info->memory       = 0; /* REVIEW ME */
1500:   if (A->factortype) {
1501:     info->fill_ratio_given  = A->info.fill_ratio_given;
1502:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1503:     info->factor_mallocs    = A->info.factor_mallocs;
1504:   } else {
1505:     info->fill_ratio_given  = 0;
1506:     info->fill_ratio_needed = 0;
1507:     info->factor_mallocs    = 0;
1508:   }
1509:   PetscFunctionReturn(PETSC_SUCCESS);
1510: }

1512: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1513: {
1514:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1516:   PetscFunctionBegin;
1517:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1518:   PetscFunctionReturn(PETSC_SUCCESS);
1519: }

1521: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1522: {
1523:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1524:   PetscInt         i, j, n, row, col, bs, mbs;
1525:   const PetscInt  *ai, *aj;
1526:   PetscReal        atmp;
1527:   const MatScalar *aa;
1528:   PetscScalar     *x;
1529:   PetscInt         ncols, brow, bcol, krow, kcol;

1531:   PetscFunctionBegin;
1532:   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1533:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1534:   bs  = A->rmap->bs;
1535:   aa  = a->a;
1536:   ai  = a->i;
1537:   aj  = a->j;
1538:   mbs = a->mbs;

1540:   PetscCall(VecSet(v, 0.0));
1541:   PetscCall(VecGetArray(v, &x));
1542:   PetscCall(VecGetLocalSize(v, &n));
1543:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1544:   for (i = 0; i < mbs; i++) {
1545:     ncols = ai[1] - ai[0];
1546:     ai++;
1547:     brow = bs * i;
1548:     for (j = 0; j < ncols; j++) {
1549:       bcol = bs * (*aj);
1550:       for (kcol = 0; kcol < bs; kcol++) {
1551:         col = bcol + kcol; /* col index */
1552:         for (krow = 0; krow < bs; krow++) {
1553:           atmp = PetscAbsScalar(*aa);
1554:           aa++;
1555:           row = brow + krow; /* row index */
1556:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1557:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1558:         }
1559:       }
1560:       aj++;
1561:     }
1562:   }
1563:   PetscCall(VecRestoreArray(v, &x));
1564:   PetscFunctionReturn(PETSC_SUCCESS);
1565: }

1567: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1568: {
1569:   PetscFunctionBegin;
1570:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1571:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1572:   PetscFunctionReturn(PETSC_SUCCESS);
1573: }

1575: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1576: {
1577:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1578:   PetscScalar       *z = c;
1579:   const PetscScalar *xb;
1580:   PetscScalar        x1;
1581:   const MatScalar   *v   = a->a, *vv;
1582:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1583: #if defined(PETSC_USE_COMPLEX)
1584:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1585: #else
1586:   const int aconj = 0;
1587: #endif

1589:   PetscFunctionBegin;
1590:   for (i = 0; i < mbs; i++) {
1591:     n = ii[1] - ii[0];
1592:     ii++;
1593:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1594:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1595:     jj = idx;
1596:     vv = v;
1597:     for (k = 0; k < cn; k++) {
1598:       idx = jj;
1599:       v   = vv;
1600:       for (j = 0; j < n; j++) {
1601:         xb = b + (*idx);
1602:         x1 = xb[0 + k * bm];
1603:         z[0 + k * cm] += v[0] * x1;
1604:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1605:         v += 1;
1606:         ++idx;
1607:       }
1608:     }
1609:     z += 1;
1610:   }
1611:   PetscFunctionReturn(PETSC_SUCCESS);
1612: }

1614: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1615: {
1616:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1617:   PetscScalar       *z = c;
1618:   const PetscScalar *xb;
1619:   PetscScalar        x1, x2;
1620:   const MatScalar   *v   = a->a, *vv;
1621:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1623:   PetscFunctionBegin;
1624:   for (i = 0; i < mbs; i++) {
1625:     n = ii[1] - ii[0];
1626:     ii++;
1627:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1628:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1629:     jj = idx;
1630:     vv = v;
1631:     for (k = 0; k < cn; k++) {
1632:       idx = jj;
1633:       v   = vv;
1634:       for (j = 0; j < n; j++) {
1635:         xb = b + 2 * (*idx);
1636:         x1 = xb[0 + k * bm];
1637:         x2 = xb[1 + k * bm];
1638:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1639:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1640:         if (*idx != i) {
1641:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1642:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1643:         }
1644:         v += 4;
1645:         ++idx;
1646:       }
1647:     }
1648:     z += 2;
1649:   }
1650:   PetscFunctionReturn(PETSC_SUCCESS);
1651: }

1653: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1654: {
1655:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1656:   PetscScalar       *z = c;
1657:   const PetscScalar *xb;
1658:   PetscScalar        x1, x2, x3;
1659:   const MatScalar   *v   = a->a, *vv;
1660:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1662:   PetscFunctionBegin;
1663:   for (i = 0; i < mbs; i++) {
1664:     n = ii[1] - ii[0];
1665:     ii++;
1666:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1667:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1668:     jj = idx;
1669:     vv = v;
1670:     for (k = 0; k < cn; k++) {
1671:       idx = jj;
1672:       v   = vv;
1673:       for (j = 0; j < n; j++) {
1674:         xb = b + 3 * (*idx);
1675:         x1 = xb[0 + k * bm];
1676:         x2 = xb[1 + k * bm];
1677:         x3 = xb[2 + k * bm];
1678:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1679:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1680:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1681:         if (*idx != i) {
1682:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1683:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1684:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1685:         }
1686:         v += 9;
1687:         ++idx;
1688:       }
1689:     }
1690:     z += 3;
1691:   }
1692:   PetscFunctionReturn(PETSC_SUCCESS);
1693: }

1695: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1696: {
1697:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1698:   PetscScalar       *z = c;
1699:   const PetscScalar *xb;
1700:   PetscScalar        x1, x2, x3, x4;
1701:   const MatScalar   *v   = a->a, *vv;
1702:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1704:   PetscFunctionBegin;
1705:   for (i = 0; i < mbs; i++) {
1706:     n = ii[1] - ii[0];
1707:     ii++;
1708:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1709:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1710:     jj = idx;
1711:     vv = v;
1712:     for (k = 0; k < cn; k++) {
1713:       idx = jj;
1714:       v   = vv;
1715:       for (j = 0; j < n; j++) {
1716:         xb = b + 4 * (*idx);
1717:         x1 = xb[0 + k * bm];
1718:         x2 = xb[1 + k * bm];
1719:         x3 = xb[2 + k * bm];
1720:         x4 = xb[3 + k * bm];
1721:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1722:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1723:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1724:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1725:         if (*idx != i) {
1726:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1727:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1728:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1729:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1730:         }
1731:         v += 16;
1732:         ++idx;
1733:       }
1734:     }
1735:     z += 4;
1736:   }
1737:   PetscFunctionReturn(PETSC_SUCCESS);
1738: }

1740: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1741: {
1742:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1743:   PetscScalar       *z = c;
1744:   const PetscScalar *xb;
1745:   PetscScalar        x1, x2, x3, x4, x5;
1746:   const MatScalar   *v   = a->a, *vv;
1747:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1749:   PetscFunctionBegin;
1750:   for (i = 0; i < mbs; i++) {
1751:     n = ii[1] - ii[0];
1752:     ii++;
1753:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1754:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1755:     jj = idx;
1756:     vv = v;
1757:     for (k = 0; k < cn; k++) {
1758:       idx = jj;
1759:       v   = vv;
1760:       for (j = 0; j < n; j++) {
1761:         xb = b + 5 * (*idx);
1762:         x1 = xb[0 + k * bm];
1763:         x2 = xb[1 + k * bm];
1764:         x3 = xb[2 + k * bm];
1765:         x4 = xb[3 + k * bm];
1766:         x5 = xb[4 + k * cm];
1767:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1768:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1769:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1770:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1771:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1772:         if (*idx != i) {
1773:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1774:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1775:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1776:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1777:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1778:         }
1779:         v += 25;
1780:         ++idx;
1781:       }
1782:     }
1783:     z += 5;
1784:   }
1785:   PetscFunctionReturn(PETSC_SUCCESS);
1786: }

1788: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1789: {
1790:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1791:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1792:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1793:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1794:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1795:   PetscBLASInt     bbs, bcn, bbm, bcm;
1796:   PetscScalar     *z = NULL;
1797:   PetscScalar     *c, *b;
1798:   const MatScalar *v;
1799:   const PetscInt  *idx, *ii;
1800:   PetscScalar      _DOne = 1.0;

1802:   PetscFunctionBegin;
1803:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1804:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1805:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1806:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1807:   b = bd->v;
1808:   PetscCall(MatZeroEntries(C));
1809:   PetscCall(MatDenseGetArray(C, &c));
1810:   switch (bs) {
1811:   case 1:
1812:     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1813:     break;
1814:   case 2:
1815:     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1816:     break;
1817:   case 3:
1818:     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1819:     break;
1820:   case 4:
1821:     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1822:     break;
1823:   case 5:
1824:     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1825:     break;
1826:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1827:     PetscCall(PetscBLASIntCast(bs, &bbs));
1828:     PetscCall(PetscBLASIntCast(cn, &bcn));
1829:     PetscCall(PetscBLASIntCast(bm, &bbm));
1830:     PetscCall(PetscBLASIntCast(cm, &bcm));
1831:     idx = a->j;
1832:     v   = a->a;
1833:     mbs = a->mbs;
1834:     ii  = a->i;
1835:     z   = c;
1836:     for (i = 0; i < mbs; i++) {
1837:       n = ii[1] - ii[0];
1838:       ii++;
1839:       for (j = 0; j < n; j++) {
1840:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1841:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1842:         v += bs2;
1843:       }
1844:       z += bs;
1845:     }
1846:   }
1847:   PetscCall(MatDenseRestoreArray(C, &c));
1848:   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1849:   PetscFunctionReturn(PETSC_SUCCESS);
1850: }