Actual source code: aijkok.kokkos.cxx

  1: #include <petsc_kokkos.hpp>
  2: #include <petscvec_kokkos.hpp>
  3: #include <petscmat_kokkos.hpp>
  4: #include <petscpkg_version.h>
  5: #include <petsc/private/petscimpl.h>
  6: #include <petsc/private/sfimpl.h>
  7: #include <petscsystypes.h>
  8: #include <petscerror.h>

 10: #include <Kokkos_Core.hpp>
 11: #include <KokkosBlas.hpp>
 12: #include <KokkosSparse_CrsMatrix.hpp>

 14: // To suppress compiler warnings:
 15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
 16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
 17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
 18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
 19: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
 20: #include <KokkosSparse_spmv.hpp>
 21: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

 23: #include <KokkosSparse_spiluk.hpp>
 24: #include <KokkosSparse_sptrsv.hpp>
 25: #include <KokkosSparse_spgemm.hpp>
 26: #include <KokkosSparse_spadd.hpp>
 27: #include <KokkosBatched_LU_Decl.hpp>
 28: #include <KokkosBatched_InverseLU_Decl.hpp>

 30: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 32: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 33:   #include <KokkosSparse_Utils.hpp>
 34: using KokkosSparse::sort_crs_matrix;
 35: using KokkosSparse::Impl::transpose_matrix;
 36: #else
 37:   #include <KokkosKernels_Sorting.hpp>
 38: using KokkosKernels::sort_crs_matrix;
 39: using KokkosKernels::Impl::transpose_matrix;
 40: #endif

 42: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 44: /* MatAssemblyEnd_SeqAIJKokkos() happens when we have finalized the nonzeros of the matrix, either after
 45:    we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
 46:    In the latter case, it is important to set a_dual's sync state correctly.
 47:  */
 48: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 49: {
 50:   Mat_SeqAIJ       *aijseq;
 51:   Mat_SeqAIJKokkos *aijkok;

 53:   PetscFunctionBegin;
 54:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 55:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 57:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 58:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 60:   /* If aijkok does not exist, we just copy i, j to device.
 61:      If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
 62:      In both cases, we build a new aijkok structure.
 63:   */
 64:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 65:     delete aijkok;
 66:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 67:     A->spptr = aijkok;
 68:   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
 69:     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
 70:     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
 71:     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
 72:   }
 73:   PetscFunctionReturn(PETSC_SUCCESS);
 74: }

 76: /* Sync CSR data to device if not yet */
 77: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 78: {
 79:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 81:   PetscFunctionBegin;
 82:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
 83:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
 84:   if (aijkok->a_dual.need_sync_device()) {
 85:     aijkok->a_dual.sync_device();
 86:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
 87:     aijkok->hermitian_updated = PETSC_FALSE;
 88:   }
 89:   PetscFunctionReturn(PETSC_SUCCESS);
 90: }

 92: /* Mark the CSR data on device as modified */
 93: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 94: {
 95:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 97:   PetscFunctionBegin;
 98:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
 99:   aijkok->a_dual.clear_sync_state();
100:   aijkok->a_dual.modify_device();
101:   aijkok->transpose_updated = PETSC_FALSE;
102:   aijkok->hermitian_updated = PETSC_FALSE;
103:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
104:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
105:   PetscFunctionReturn(PETSC_SUCCESS);
106: }
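
/* A minimal sketch of the intended calling convention (illustrative; assumes a
   Kokkos-enabled PETSc build and an assembled MATSEQAIJKOKKOS matrix A): code that
   writes A's value array directly on device should mark the device side modified
   afterwards, so that later host accesses trigger a sync.

     Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
     PetscCallCXX(KokkosBlas::fill(PetscGetKokkosExecutionSpace(), akok->a_dual.view_device(), 1.0)); // write on device
     PetscCall(MatSeqAIJKokkosModifyDevice(A)); // device now holds the latest values
*/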

108: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
109: {
110:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111:   auto              exec   = PetscGetKokkosExecutionSpace();

113:   PetscFunctionBegin;
114:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
115:   /* We do not expect one to need factors on host */
116:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
117:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
118:   PetscCallCXX(aijkok->a_dual.sync_host(exec));
119:   PetscCallCXX(exec.fence());
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
124: {
125:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

127:   PetscFunctionBegin;
128:   /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
129:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
130:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
131:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
132:   */
133:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
134:     auto exec = PetscGetKokkosExecutionSpace();
135:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
136:     PetscCallCXX(exec.fence());
137:     *array = aijkok->a_dual.view_host().data();
138:   } else { /* Happens when calling MatSetValues on a newly created matrix */
139:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
140:   }
141:   PetscFunctionReturn(PETSC_SUCCESS);
142: }

144: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
145: {
146:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

148:   PetscFunctionBegin;
149:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
154: {
155:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

157:   PetscFunctionBegin;
158:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
159:     auto exec = PetscGetKokkosExecutionSpace();
160:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
161:     PetscCallCXX(exec.fence());
162:     *array = aijkok->a_dual.view_host().data();
163:   } else {
164:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
165:   }
166:   PetscFunctionReturn(PETSC_SUCCESS);
167: }

169: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
170: {
171:   PetscFunctionBegin;
172:   *array = NULL;
173:   PetscFunctionReturn(PETSC_SUCCESS);
174: }

176: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177: {
178:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

180:   PetscFunctionBegin;
181:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182:     *array = aijkok->a_dual.view_host().data();
183:   } else { /* E.g., this happens with MatZeroEntries on a preallocated but not assembled matrix */
184:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
185:   }
186:   PetscFunctionReturn(PETSC_SUCCESS);
187: }

189: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
190: {
191:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

193:   PetscFunctionBegin;
194:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
195:     aijkok->a_dual.clear_sync_state();
196:     aijkok->a_dual.modify_host();
197:   }
198:   PetscFunctionReturn(PETSC_SUCCESS);
199: }

201: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
202: {
203:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

205:   PetscFunctionBegin;
206:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

208:   if (i) *i = aijkok->i_device_data();
209:   if (j) *j = aijkok->j_device_data();
210:   if (a) {
211:     aijkok->a_dual.sync_device();
212:     *a = aijkok->a_device_data();
213:   }
214:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
215:   PetscFunctionReturn(PETSC_SUCCESS);
216: }
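
/* Usage sketch (illustrative): callers typically reach this routine through the public
   MatSeqAIJGetCSRAndMemType() and hand the raw CSR arrays to an external device library.

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     // here i, j, a point to device memory and mtype is PETSC_MEMTYPE_KOKKOS
*/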

218: /*
219:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

221:   Input Parameter:
222: .  A       - the MATSEQAIJKOKKOS matrix

224:   Output Parameters:
225: +  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
226: -  T_d    - the transpose on device, whose value array is allocated but not initialized
227: */
228: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
229: {
230:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
231:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
232:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
233:   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
234:   MatRowMapType          *Ti = Ti_h.data();
235:   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
236:   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
237:   PetscInt               *Tj   = Tj_h.data();
238:   PetscInt               *perm = perm_h.data();
239:   PetscInt               *offset;

241:   PetscFunctionBegin;
242:   // Populate Ti
243:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
244:   Ti++;
245:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
246:   Ti--;
247:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

249:   // Populate Tj and the permutation array
250:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
251:   for (PetscInt i = 0; i < m; i++) {
252:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
253:       PetscInt r    = Aj[j];                       // row r of T
254:       PetscInt disp = Ti[r] + offset[r];

256:       Tj[disp]   = i; // col i of T
257:       perm[disp] = j;
258:       offset[r]++;
259:     }
260:   }
261:   PetscCall(PetscFree(offset));

263:   // Sort each row of T, along with the permutation array
264:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

266:   // Output perm and T on device
267:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
268:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
269:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
270:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
271:   PetscFunctionReturn(PETSC_SUCCESS);
272: }
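
/* A tiny worked example of the counting sort above (illustrative): for a 2x3 matrix with
   Ai = {0,2,3} and Aj = {0,2,1} (row 0 holds columns 0 and 2; row 1 holds column 1),
   counting column occurrences and prefix-summing gives Ti = {0,1,2,3}; the fill loop then
   yields Tj = {0,1,0} and perm = {0,2,1}. E.g., T's row 2 (A's column 2) takes its value
   from Aa(perm(2)) = Aa(1), i.e., the entry A(0,2). */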

274: // Generate the transpose on device and cache it internally
275: // Note: KK's transpose_matrix() does not support separate symbolic and numeric phases, so we do it on our own
276: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
277: {
278:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
279:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
280:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
281:   KokkosCsrMatrix  &T = akok->csrmatT;

283:   PetscFunctionBegin;
284:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
285:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

287:   const auto &Aa = akok->a_dual.view_device();

289:   if (A->symmetric == PETSC_BOOL3_TRUE) {
290:     *csrmatT = akok->csrmat;
291:   } else {
292:     // See if we already have a cached transpose and its values are up to date
293:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
294:       if (!akok->transpose_updated) {            // if the values are out of date, update the cached version
295:         const auto &perm = akok->transpose_perm; // get the permutation array
296:         auto       &Ta   = T.values;

298:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
299:       }
300:     } else { // Generate T of size n x m for the first time
301:       MatRowMapKokkosView perm;

303:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
304:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
305:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
306:     }
307:     akok->transpose_updated = PETSC_TRUE;
308:     *csrmatT                = akok->csrmatT;
309:   }
310:   PetscFunctionReturn(PETSC_SUCCESS);
311: }

313: // Generate the Hermitian transpose on device and cache it internally
314: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
315: {
316:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
317:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
318:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
319:   KokkosCsrMatrix  &T = akok->csrmatH;

321:   PetscFunctionBegin;
322:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
323:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

325:   const auto &Aa = akok->a_dual.view_device();

327:   if (A->hermitian == PETSC_BOOL3_TRUE) {
328:     *csrmatH = akok->csrmat;
329:   } else {
330:     // See if we already have a cached Hermitian transpose and its values are up to date
331:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
332:       if (!akok->hermitian_updated) {            // if the values are out of date, update the cached version
333:         const auto &perm = akok->transpose_perm; // get the permutation array
334:         auto       &Ta   = T.values;

336:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
337:       }
338:     } else { // Generate T of size n x m for the first time
339:       MatRowMapKokkosView perm;

341:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
342:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
343:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
344:     }
345:     akok->hermitian_updated = PETSC_TRUE;
346:     *csrmatH                = akok->csrmatH;
347:   }
348:   PetscFunctionReturn(PETSC_SUCCESS);
349: }

351: /* y = A x */
352: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
353: {
354:   Mat_SeqAIJKokkos          *aijkok;
355:   ConstPetscScalarKokkosView xv;
356:   PetscScalarKokkosView      yv;

358:   PetscFunctionBegin;
359:   PetscCall(PetscLogGpuTimeBegin());
360:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
361:   PetscCall(VecGetKokkosView(xx, &xv));
362:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
363:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
364:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
365:   PetscCall(VecRestoreKokkosView(xx, &xv));
366:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
367:   /* 2.0*nnz - numRows would be more accurate, but that assumes there are no zero rows, so we are a little sloppy here. */
368:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
369:   PetscCall(PetscLogGpuTimeEnd());
370:   PetscFunctionReturn(PETSC_SUCCESS);
371: }
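
/* Usage sketch (illustrative): since A->defaultvectype is VECKOKKOS, vectors created from
   A keep their data on device, so the spmv above runs without host-device transfers.

     Vec x, y;
     PetscCall(MatCreateVecs(A, &x, &y));
     PetscCall(VecSet(x, 1.0));
     PetscCall(MatMult(A, x, y)); // dispatches to MatMult_SeqAIJKokkos()
*/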

373: /* y = A^T x */
374: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
375: {
376:   Mat_SeqAIJKokkos          *aijkok;
377:   const char                *mode;
378:   ConstPetscScalarKokkosView xv;
379:   PetscScalarKokkosView      yv;
380:   KokkosCsrMatrix            csrmat;

382:   PetscFunctionBegin;
383:   PetscCall(PetscLogGpuTimeBegin());
384:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
385:   PetscCall(VecGetKokkosView(xx, &xv));
386:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
387:   if (A->form_explicit_transpose) {
388:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
389:     mode = "N";
390:   } else {
391:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
392:     csrmat = aijkok->csrmat;
393:     mode   = "T";
394:   }
395:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
396:   PetscCall(VecRestoreKokkosView(xx, &xv));
397:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
398:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
399:   PetscCall(PetscLogGpuTimeEnd());
400:   PetscFunctionReturn(PETSC_SUCCESS);
401: }

403: /* y = A^H x */
404: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405: {
406:   Mat_SeqAIJKokkos          *aijkok;
407:   const char                *mode;
408:   ConstPetscScalarKokkosView xv;
409:   PetscScalarKokkosView      yv;
410:   KokkosCsrMatrix            csrmat;

412:   PetscFunctionBegin;
413:   PetscCall(PetscLogGpuTimeBegin());
414:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
415:   PetscCall(VecGetKokkosView(xx, &xv));
416:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
417:   if (A->form_explicit_transpose) {
418:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
419:     mode = "N";
420:   } else {
421:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422:     csrmat = aijkok->csrmat;
423:     mode   = "C";
424:   }
425:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
426:   PetscCall(VecRestoreKokkosView(xx, &xv));
427:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
428:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
429:   PetscCall(PetscLogGpuTimeEnd());
430:   PetscFunctionReturn(PETSC_SUCCESS);
431: }

433: /* z = A x + y */
434: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
435: {
436:   Mat_SeqAIJKokkos          *aijkok;
437:   ConstPetscScalarKokkosView xv;
438:   PetscScalarKokkosView      zv;

440:   PetscFunctionBegin;
441:   PetscCall(PetscLogGpuTimeBegin());
442:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
443:   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
444:   PetscCall(VecGetKokkosView(xx, &xv));
445:   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
446:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
447:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
448:   PetscCall(VecRestoreKokkosView(xx, &xv));
449:   PetscCall(VecRestoreKokkosView(zz, &zv));
450:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
451:   PetscCall(PetscLogGpuTimeEnd());
452:   PetscFunctionReturn(PETSC_SUCCESS);
453: }

455: /* z = A^T x + y */
456: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
457: {
458:   Mat_SeqAIJKokkos          *aijkok;
459:   const char                *mode;
460:   ConstPetscScalarKokkosView xv;
461:   PetscScalarKokkosView      zv;
462:   KokkosCsrMatrix            csrmat;

464:   PetscFunctionBegin;
465:   PetscCall(PetscLogGpuTimeBegin());
466:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
467:   if (zz != yy) PetscCall(VecCopy(yy, zz));
468:   PetscCall(VecGetKokkosView(xx, &xv));
469:   PetscCall(VecGetKokkosView(zz, &zv));
470:   if (A->form_explicit_transpose) {
471:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
472:     mode = "N";
473:   } else {
474:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
475:     csrmat = aijkok->csrmat;
476:     mode   = "T";
477:   }
478:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
479:   PetscCall(VecRestoreKokkosView(xx, &xv));
480:   PetscCall(VecRestoreKokkosView(zz, &zv));
481:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
482:   PetscCall(PetscLogGpuTimeEnd());
483:   PetscFunctionReturn(PETSC_SUCCESS);
484: }

486: /* z = A^H x + y */
487: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
488: {
489:   Mat_SeqAIJKokkos          *aijkok;
490:   const char                *mode;
491:   ConstPetscScalarKokkosView xv;
492:   PetscScalarKokkosView      zv;
493:   KokkosCsrMatrix            csrmat;

495:   PetscFunctionBegin;
496:   PetscCall(PetscLogGpuTimeBegin());
497:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
498:   if (zz != yy) PetscCall(VecCopy(yy, zz));
499:   PetscCall(VecGetKokkosView(xx, &xv));
500:   PetscCall(VecGetKokkosView(zz, &zv));
501:   if (A->form_explicit_transpose) {
502:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
503:     mode = "N";
504:   } else {
505:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
506:     csrmat = aijkok->csrmat;
507:     mode   = "C";
508:   }
509:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
510:   PetscCall(VecRestoreKokkosView(xx, &xv));
511:   PetscCall(VecRestoreKokkosView(zz, &zv));
512:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
513:   PetscCall(PetscLogGpuTimeEnd());
514:   PetscFunctionReturn(PETSC_SUCCESS);
515: }

517: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
518: {
519:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

521:   PetscFunctionBegin;
522:   switch (op) {
523:   case MAT_FORM_EXPLICIT_TRANSPOSE:
524:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
525:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
526:     A->form_explicit_transpose = flg;
527:     break;
528:   default:
529:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
530:     break;
531:   }
532:   PetscFunctionReturn(PETSC_SUCCESS);
533: }
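
/* Usage sketch (illustrative; x, y are placeholder vectors): with the option set, the
   transpose kernels above build and cache A^T once and then apply it with mode "N",
   which can be faster than spmv's transpose mode.

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE));
     PetscCall(MatMultTranspose(A, x, y)); // uses the cached explicit transpose
*/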

535: /* Depending on reuse, either build a new mat, or use the existing mat */
536: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
537: {
538:   Mat_SeqAIJ *aseq;

540:   PetscFunctionBegin;
541:   PetscCall(PetscKokkosInitializeCheck());
542:   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
543:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
544:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
545:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
546:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
547:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
548:     PetscCall(PetscFree(A->defaultvectype));
549:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
550:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
551:     PetscCall(MatSetOps_SeqAIJKokkos(A));
552:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
553:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix, if not done yet */
554:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
555:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
556:     }
557:   }
558:   PetscFunctionReturn(PETSC_SUCCESS);
559: }
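
/* Usage sketch (illustrative) of the three reuse modes handled above:

     Mat B;
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &B)); // brand new Kokkos copy of A
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_REUSE_MATRIX, &B));   // copy A into the existing B
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); // retype A itself
*/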

561: /* MatDuplicate always creates a new matrix. It can be called on either an assembled or
562:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
563:  */
564: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
565: {
566:   Mat_SeqAIJ       *bseq;
567:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
568:   Mat               mat;

570:   PetscFunctionBegin;
571:   /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
572:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
573:   mat = *B;
574:   if (A->assembled) {
575:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
576:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
577:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
578:     /* Now copy values to B if needed */
579:     if (dupOption == MAT_COPY_VALUES) {
580:       if (akok->a_dual.need_sync_device()) {
581:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
582:         bkok->a_dual.modify_host();
583:       } else { /* If device has the latest data, we only copy data on device */
584:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
585:         bkok->a_dual.modify_device();
586:       }
587:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
588:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
589:       bkok->a_dual.modify_host();
590:     }
591:     mat->spptr = bkok;
592:   }

594:   PetscCall(PetscFree(mat->defaultvectype));
595:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
596:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
597:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
598:   PetscFunctionReturn(PETSC_SUCCESS);
599: }

601: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
602: {
603:   Mat               At;
604:   KokkosCsrMatrix   internT;
605:   Mat_SeqAIJKokkos *atkok, *bkok;

607:   PetscFunctionBegin;
608:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
609:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
610:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
611:     /* Deep copy internT, as we want to isolate the internal transpose */
612:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
613:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
614:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
615:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
616:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
617:     if ((*B)->assembled) {
618:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
619:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
620:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
621:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
622:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
623:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
624:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
625:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
626:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
627:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
628:   }
629:   PetscFunctionReturn(PETSC_SUCCESS);
630: }

632: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
633: {
634:   Mat_SeqAIJKokkos *aijkok;

636:   PetscFunctionBegin;
637:   if (A->factortype == MAT_FACTOR_NONE) {
638:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
639:     delete aijkok;
640:   } else {
641:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
642:   }
643:   A->spptr = NULL;
644:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
645:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
646:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
647: #if defined(PETSC_HAVE_HYPRE)
648:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
649: #endif
650:   PetscCall(MatDestroy_SeqAIJ(A));
651:   PetscFunctionReturn(PETSC_SUCCESS);
652: }

654: /*MC
655:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

657:    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

659:    Options Database Key:
660: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

662:   Level: beginner

664: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
665: M*/
666: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
667: {
668:   PetscFunctionBegin;
669:   PetscCall(PetscKokkosInitializeCheck());
670:   PetscCall(MatCreate_SeqAIJ(A));
671:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
672:   PetscFunctionReturn(PETSC_SUCCESS);
673: }
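
/* Creation sketch (illustrative; m and n are placeholder sizes):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, n, m, n));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); // or MatSetFromOptions() with -mat_type aijkokkos
*/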

675: /* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
676: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
677: {
678:   Mat_SeqAIJ         *a, *b;
679:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
680:   MatScalarKokkosView aa, ba, ca;
681:   MatRowMapKokkosView ai, bi, ci;
682:   MatColIdxKokkosView aj, bj, cj;
683:   PetscInt            m, n, nnz, aN;

685:   PetscFunctionBegin;
 686:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
 687:   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
 688:   PetscAssertPointer(C, 4);
689:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
690:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
 691:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
692:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

694:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
695:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
696:   a    = static_cast<Mat_SeqAIJ *>(A->data);
697:   b    = static_cast<Mat_SeqAIJ *>(B->data);
698:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
699:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
700:   aa   = akok->a_dual.view_device();
701:   ai   = akok->i_dual.view_device();
702:   ba   = bkok->a_dual.view_device();
703:   bi   = bkok->i_dual.view_device();
704:   m    = A->rmap->n; /* M, N and nnz of C */
705:   n    = A->cmap->n + B->cmap->n;
706:   nnz  = a->nz + b->nz;
707:   aN   = A->cmap->n; /* N of A */
708:   if (reuse == MAT_INITIAL_MATRIX) {
709:     aj           = akok->j_dual.view_device();
710:     bj           = bkok->j_dual.view_device();
711:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
712:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
713:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
714:     ca           = ca_dual.view_device();
715:     ci           = ci_dual.view_device();
716:     cj           = cj_dual.view_device();

718:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
719:     Kokkos::parallel_for(
720:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
721:         PetscInt i       = t.league_rank(); /* row i */
722:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

724:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
725:                                                    ci(i) = coffset;
726:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
727:         });

729:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
730:           if (k < alen) {
731:             ca(coffset + k) = aa(ai(i) + k);
732:             cj(coffset + k) = aj(ai(i) + k);
733:           } else {
734:             ca(coffset + k) = ba(bi(i) + k - alen);
735:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
736:           }
737:         });
738:       });
739:     ca_dual.modify_device();
740:     ci_dual.modify_device();
741:     cj_dual.modify_device();
742:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
743:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
744:   } else if (reuse == MAT_REUSE_MATRIX) {
 745:     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
 746:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
747:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
748:     ca   = ckok->a_dual.view_device();
749:     ci   = ckok->i_dual.view_device();

751:     Kokkos::parallel_for(
752:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753:         PetscInt i    = t.league_rank(); /* row i */
754:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
756:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
757:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
758:         });
759:       });
760:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
761:   }
762:   PetscFunctionReturn(PETSC_SUCCESS);
763: }
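
/* A tiny worked example (illustrative): with m = 1, A = [a0 a1] (aN = 2) and B = [b0],
   row 0 of C is [a0 a1 b0]; B's column 0 becomes column 0 + aN = 2 in C, which is the
   "+ aN" shift applied to cj in the kernel above. */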

765: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
766: {
767:   PetscFunctionBegin;
768:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
769:   PetscFunctionReturn(PETSC_SUCCESS);
770: }

772: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
773: {
774:   Mat_Product                 *product = C->product;
775:   Mat                          A, B;
776:   bool                         transA, transB; /* use bool, since KK needs this type */
777:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
778:   Mat_SeqAIJ                  *c;
779:   MatProductData_SeqAIJKokkos *pdata;
780:   KokkosCsrMatrix              csrmatA, csrmatB;

782:   PetscFunctionBegin;
783:   MatCheckProduct(C, 1);
784:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
785:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

787:   // See if the numeric phase has already been done in the symbolic phase (e.g., the user called MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
788:   // If yes, skip it, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
789:   // we still do the numeric phase.
790:   if (pdata->reusesym) { // numeric reuses results from symbolic
791:     pdata->reusesym = PETSC_FALSE;
792:     PetscFunctionReturn(PETSC_SUCCESS);
793:   }

795:   switch (product->type) {
796:   case MATPRODUCT_AB:
797:     transA = false;
798:     transB = false;
799:     break;
800:   case MATPRODUCT_AtB:
801:     transA = true;
802:     transB = false;
803:     break;
804:   case MATPRODUCT_ABt:
805:     transA = false;
806:     transB = true;
807:     break;
808:   default:
809:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
810:   }

812:   A = product->A;
813:   B = product->B;
814:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
815:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
816:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
817:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
818:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

820:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

822:   csrmatA = akok->csrmat;
823:   csrmatB = bkok->csrmat;

825:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
826:   if (transA) {
827:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
828:     transA = false;
829:   }

831:   if (transB) {
832:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
833:     transB = false;
834:   }
835:   PetscCall(PetscLogGpuTimeBegin());
836:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
837: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
838:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
839:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
840: #endif

842:   PetscCall(PetscLogGpuTimeEnd());
843:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
844:   /* shorter version of MatAssemblyEnd_SeqAIJ */
845:   c = (Mat_SeqAIJ *)C->data;
846:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
847:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
848:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
849:   c->reallocs         = 0;
850:   C->info.mallocs     = 0;
851:   C->info.nz_unneeded = 0;
852:   C->assembled = C->was_assembled = PETSC_TRUE;
853:   C->num_ass++;
854:   PetscFunctionReturn(PETSC_SUCCESS);
855: }

857: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
858: {
859:   Mat_Product                 *product = C->product;
860:   MatProductType               ptype;
861:   Mat                          A, B;
862:   bool                         transA, transB;
863:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
864:   MatProductData_SeqAIJKokkos *pdata;
865:   MPI_Comm                     comm;
866:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

868:   PetscFunctionBegin;
869:   MatCheckProduct(C, 1);
870:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
871:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
872:   A = product->A;
873:   B = product->B;
874:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
875:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
876:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
877:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
878:   csrmatA = akok->csrmat;
879:   csrmatB = bkok->csrmat;

881:   ptype = product->type;
882:   // Take advantage of the symmetry if true
883:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
884:     ptype                                          = MATPRODUCT_AB;
885:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
886:   }
887:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
888:     ptype                                          = MATPRODUCT_AB;
889:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
890:   }

892:   switch (ptype) {
893:   case MATPRODUCT_AB:
894:     transA = false;
895:     transB = false;
896:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
897:     break;
898:   case MATPRODUCT_AtB:
899:     transA = true;
900:     transB = false;
901:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
902:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
903:     break;
904:   case MATPRODUCT_ABt:
905:     transA = false;
906:     transB = true;
907:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
908:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
909:     break;
910:   default:
911:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
912:   }
913:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
914:   pdata->reusesym = product->api_user;

916:   /* TODO: add command line options to select spgemm algorithms */
917:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

919:   /* CUDA 10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
920: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
921:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
922:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
923:   #endif
924: #endif
925:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

927:   PetscCall(PetscLogGpuTimeBegin());
928:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
929:   if (transA) {
930:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
931:     transA = false;
932:   }

934:   if (transB) {
935:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
936:     transB = false;
937:   }

939:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
940:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
941:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
942:     calling new Mat_SeqAIJKokkos().
943:     TODO: Remove the fake spgemm_numeric() once KK fixes this problem.
944:   */
945:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
946: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
947:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
948:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
949:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
950: #endif
951:   PetscCall(PetscLogGpuTimeEnd());

953:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
954:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
955:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
956:   PetscFunctionReturn(PETSC_SUCCESS);
957: }
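
/* Usage sketch (illustrative): the reusesym flag set above lets the first MatMatMult()
   call with MAT_INITIAL_MATRIX skip a separate numeric phase, since spgemm_numeric()
   has already been run here.

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic (+ numeric)
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // numeric only, reusing pdata->kh
*/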

959: /* handles sparse matrix-matrix ops */
960: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
961: {
962:   Mat_Product *product = mat->product;
963:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

965:   PetscFunctionBegin;
966:   MatCheckProduct(mat, 1);
967:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
968:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
969:   if (Biskok && Ciskok) {
970:     switch (product->type) {
971:     case MATPRODUCT_AB:
972:     case MATPRODUCT_AtB:
973:     case MATPRODUCT_ABt:
974:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
975:       break;
976:     case MATPRODUCT_PtAP:
977:     case MATPRODUCT_RARt:
978:     case MATPRODUCT_ABC:
979:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
980:       break;
981:     default:
982:       break;
983:     }
984:   } else { /* fallback for AIJ */
985:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
986:   }
987:   PetscFunctionReturn(PETSC_SUCCESS);
988: }

990: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
991: {
992:   Mat_SeqAIJKokkos *aijkok;

994:   PetscFunctionBegin;
995:   PetscCall(PetscLogGpuTimeBegin());
996:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
997:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
998:   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
999:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1000:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1001:   PetscCall(PetscLogGpuTimeEnd());
1002:   PetscFunctionReturn(PETSC_SUCCESS);
1003: }

1005: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1006: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1007: {
1008:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

1010:   PetscFunctionBegin;
1011:   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1012:     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

1014:     PetscCall(PetscLogGpuTimeBegin());
1015:     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1016:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1017:     const auto &Aa     = aijkok->a_dual.view_device();
1018:     const auto &Adiag  = aijkok->diag_dual.view_device();
1019:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1020:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021:     PetscCall(PetscLogGpuFlops(n));
1022:     PetscCall(PetscLogGpuTimeEnd());
1023:   } else { // need reassembly, very slow!
1024:     PetscCall(MatShift_Basic(A, a));
1025:   }
1026:   PetscFunctionReturn(PETSC_SUCCESS);
1027: }

1029: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1030: {
1031:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

1033:   PetscFunctionBegin;
1034:   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1035:     ConstPetscScalarKokkosView dv;
1036:     PetscInt                   n, nv;

1038:     PetscCall(PetscLogGpuTimeBegin());
1039:     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1040:     PetscCall(VecGetKokkosView(D, &dv));
1041:     PetscCall(VecGetLocalSize(D, &nv));
1042:     n = PetscMin(Y->rmap->n, Y->cmap->n);
1043:     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

1045:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1046:     const auto &Aa     = aijkok->a_dual.view_device();
1047:     const auto &Adiag  = aijkok->diag_dual.view_device();
1048:     PetscCallCXX(Kokkos::parallel_for(
1049:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1050:         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1051:         else Aa(Adiag(i)) += dv(i);
1052:       }));
1053:     PetscCall(VecRestoreKokkosView(D, &dv));
1054:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1055:     PetscCall(PetscLogGpuFlops(n));
1056:     PetscCall(PetscLogGpuTimeEnd());
1057:   } else { // need reassembly, very slow!
1058:     PetscCall(MatDiagonalSet_Default(Y, D, is));
1059:   }
1060:   PetscFunctionReturn(PETSC_SUCCESS);
1061: }

1063: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1064: {
1065:   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1066:   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1067:   ConstPetscScalarKokkosView lv, rv;

1069:   PetscFunctionBegin;
1070:   PetscCall(PetscLogGpuTimeBegin());
1071:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1072:   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1073:   const auto &Aa     = aijkok->a_dual.view_device();
1074:   const auto &Ai     = aijkok->i_dual.view_device();
1075:   const auto &Aj     = aijkok->j_dual.view_device();
1076:   if (ll) {
1077:     PetscCall(VecGetLocalSize(ll, &m));
1078:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1079:     PetscCall(VecGetKokkosView(ll, &lv));
1080:     PetscCallCXX(Kokkos::parallel_for( // for each row
1081:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1082:         PetscInt i   = t.league_rank(); // row i
1083:         PetscInt len = Ai(i + 1) - Ai(i);
1084:         // scale entries on the row
1085:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1086:       }));
1087:     PetscCall(VecRestoreKokkosView(ll, &lv));
1088:     PetscCall(PetscLogGpuFlops(nz));
1089:   }
1090:   if (rr) {
1091:     PetscCall(VecGetLocalSize(rr, &n));
1092:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1093:     PetscCall(VecGetKokkosView(rr, &rv));
1094:     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1095:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1096:     PetscCall(VecRestoreKokkosView(rr, &rv));
1097:     PetscCall(PetscLogGpuFlops(nz));
1098:   }
1099:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1100:   PetscCall(PetscLogGpuTimeEnd());
1101:   PetscFunctionReturn(PETSC_SUCCESS);
1102: }

1104: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1105: {
1106:   Mat_SeqAIJKokkos *aijkok;

1108:   PetscFunctionBegin;
1109:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1110:   if (aijkok) { /* Only zero the device if data is already there */
1111:     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1112:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1113:   } else { /* Might be preallocated but not assembled */
1114:     PetscCall(MatZeroEntries_SeqAIJ(A));
1115:   }
1116:   PetscFunctionReturn(PETSC_SUCCESS);
1117: }

1119: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1120: {
1121:   Mat_SeqAIJKokkos     *aijkok;
1122:   PetscInt              n;
1123:   PetscScalarKokkosView xv;

1125:   PetscFunctionBegin;
1126:   PetscCall(VecGetLocalSize(x, &n));
1127:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1128:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1130:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1131:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1133:   const auto &Aa    = aijkok->a_dual.view_device();
1134:   const auto &Ai    = aijkok->i_dual.view_device();
1135:   const auto &Adiag = aijkok->diag_dual.view_device();

1137:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1138:   Kokkos::parallel_for(
1139:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1140:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1141:       else xv(i) = 0;
1142:     });
1143:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1144:   PetscFunctionReturn(PETSC_SUCCESS);
1145: }

1147: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1148: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1149: {
1150:   Mat_SeqAIJKokkos *aijkok;

1152:   PetscFunctionBegin;
1153:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1154:   PetscAssertPointer(kv, 2);
1155:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1156:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1157:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1158:   *kv    = aijkok->a_dual.view_device();
1159:   PetscFunctionReturn(PETSC_SUCCESS);
1160: }

1162: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1163: {
1164:   PetscFunctionBegin;
1165:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1166:   PetscAssertPointer(kv, 2);
1167:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1168:   PetscFunctionReturn(PETSC_SUCCESS);
1169: }

1171: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1172: {
1173:   Mat_SeqAIJKokkos *aijkok;

1175:   PetscFunctionBegin;
1176:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1177:   PetscAssertPointer(kv, 2);
1178:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1180:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1181:   *kv    = aijkok->a_dual.view_device();
1182:   PetscFunctionReturn(PETSC_SUCCESS);
1183: }

1185: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1186: {
1187:   PetscFunctionBegin;
1188:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1189:   PetscAssertPointer(kv, 2);
1190:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1192:   PetscFunctionReturn(PETSC_SUCCESS);
1193: }

1195: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1196: {
1197:   Mat_SeqAIJKokkos *aijkok;

1199:   PetscFunctionBegin;
1200:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1201:   PetscAssertPointer(kv, 2);
1202:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1203:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1204:   *kv    = aijkok->a_dual.view_device();
1205:   PetscFunctionReturn(PETSC_SUCCESS);
1206: }

1208: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1209: {
1210:   PetscFunctionBegin;
1211:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1212:   PetscAssertPointer(kv, 2);
1213:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1214:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1215:   PetscFunctionReturn(PETSC_SUCCESS);
1216: }
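
/* Usage sketch (illustrative) of the get/restore pairs above: operate on the device value
   array, then let the restore call mark it modified.

     MatScalarKokkosView av;
     PetscCall(MatSeqAIJGetKokkosView(A, &av)); // synced device view of A's values
     PetscCallCXX(KokkosBlas::scal(PetscGetKokkosExecutionSpace(), av, 2.0, av)); // A *= 2 on device
     PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); // marks the device values modified
*/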

1218: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1219: {
1220:   Mat_SeqAIJKokkos *akok;

1222:   PetscFunctionBegin;
1223:   auto exec = PetscGetKokkosExecutionSpace();
1224:   // Create host copies of the input aij
1225:   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1226:   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1227:   // Don't copy the vals to the host now
1228:   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);

1230:   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1231:   // Note we have modified the device data, so the host side will be synced lazily when needed
1232:   a_dual.modify_device();
1233:   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1234:   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);

1236:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1237:   PetscCall(MatCreate(comm, A));
1238:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1239:   PetscFunctionReturn(PETSC_SUCCESS);
1240: }
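
/* Usage sketch (illustrative; m, n, nnz are placeholders): wrap device-resident CSR arrays
   as a MATSEQAIJKOKKOS without copying the values through the host.

     Kokkos::View<PetscInt *>    i_d("i", m + 1), j_d("j", nnz);
     Kokkos::View<PetscScalar *> a_d("a", nnz);
     // ... fill i_d, j_d, a_d on device ...
     Mat A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, m, n, i_d, j_d, a_d, &A));
*/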

1242: /* Computes Y += alpha X */
1243: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1244: {
1245:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1246:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1247:   ConstMatScalarKokkosView Xa;
1248:   MatScalarKokkosView      Ya;
1249:   auto                     exec = PetscGetKokkosExecutionSpace();

1251:   PetscFunctionBegin;
1252:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1253:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1254:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1255:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1256:   PetscCall(PetscLogGpuTimeBegin());

1258:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1259:     PetscBool e;
1260:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1261:     if (e) {
1262:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1263:       if (e) pattern = SAME_NONZERO_PATTERN;
1264:     }
1265:   }

1267:   /* cusparseDcsrgeam2() computes C = alpha*A + beta*B. If one knows the sparsity pattern of C beforehand, one can skip
1268:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseScsrgeam2().
1269:     If X's pattern is a subset of Y's (SUBSET_NONZERO_PATTERN), we could take advantage of this cusparse feature. However,
1270:     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY() is a single call that cannot exploit a known pattern of C.
1271:   */
1272:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1273:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1274:   Xa   = xkok->a_dual.view_device();
1275:   Ya   = ykok->a_dual.view_device();

1277:   if (pattern == SAME_NONZERO_PATTERN) {
1278:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1279:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1280:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1281:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1282:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1284:     Kokkos::parallel_for(
1285:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1286:         PetscInt i = t.league_rank(); // row i
1287:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1288:           // Only one thread works in a team
1289:           PetscInt p, q = Yi(i);
1290:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1291:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y, without reading past the end of the row
1292:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {       // Found it
1293:               Ya(q) += alpha * Xa(p);
1294:               q++;
1295:             } else {
1296:             // If not found, the input is wrong (X's nonzero pattern is not a subset of Y's).
1297:             // Insert a NaN at the beginning of row i (if the row is not empty) so that the wrong result is easy to detect.
1298: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1299:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1300: #else
1301:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1302: #endif
1303:             }
1304:           }
1305:         });
1306:       });
1307:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1308:   } else { // different nonzero patterns
1309:     Mat             Z;
1310:     KokkosCsrMatrix zcsr;
1311:     KernelHandle    kh;
1312:     kh.create_spadd_handle(true); // X, Y are sorted
1313:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1314:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1315:     zkok = new Mat_SeqAIJKokkos(zcsr);
1316:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1317:     PetscCall(MatHeaderReplace(Y, &Z));
1318:     kh.destroy_spadd_handle();
1319:   }
1320:   PetscCall(PetscLogGpuTimeEnd());
1321:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1322:   PetscFunctionReturn(PETSC_SUCCESS);
1323: }
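
/* Through the public API, the routine above is typically reached as in this sketch (Y, X and alpha
   are assumed to exist):

     PetscCall(MatAXPY(Y, alpha, X, SUBSET_NONZERO_PATTERN)); // Y += alpha*X; the pattern hint selects a fast path above
*/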

1325: struct MatCOOStruct_SeqAIJKokkos {
1326:   PetscCount           n;
1327:   PetscCount           Atot;
1328:   PetscInt             nz;
1329:   PetscCountKokkosView jmap;
1330:   PetscCountKokkosView perm;

1332:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1333:   {
1334:     nz   = coo_h->nz;
1335:     n    = coo_h->n;
1336:     Atot = coo_h->Atot;
1337:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1338:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1339:   }
1340: };
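
/* A worked example of jmap/perm, consistent with the summing kernel in MatSetValuesCOO_SeqAIJKokkos()
   below (the concrete numbers are illustrative): for n = 3 input entries (0,1), (1,0), (0,1), the
   matrix has nz = 2 unique nonzeros, stored in CSR order as (0,1) then (1,0). Inputs 0 and 2
   contribute to nonzero 0 and input 1 contributes to nonzero 1, giving jmap = {0, 2, 3} and
   perm = {0, 2, 1}. */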

1342: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1343: {
1344:   PetscFunctionBegin;
1345:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1346:   PetscFunctionReturn(PETSC_SUCCESS);
1347: }

1349: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1350: {
1351:   Mat_SeqAIJKokkos          *akok;
1352:   Mat_SeqAIJ                *aseq;
1353:   PetscContainer             container_h;
1354:   MatCOOStruct_SeqAIJ       *coo_h;
1355:   MatCOOStruct_SeqAIJKokkos *coo_d;

1357:   PetscFunctionBegin;
1358:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1359:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1360:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1361:   delete akok;
1362:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1363:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1365:   // Copy the COO struct to device
1366:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1367:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1368:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1370:   // Put the COO struct in a container and then attach that to the matrix
1371:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1372:   PetscFunctionReturn(PETSC_SUCCESS);
1373: }

1375: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1376: {
1377:   MatScalarKokkosView        Aa;
1378:   ConstMatScalarKokkosView   kv;
1379:   PetscMemType               memtype;
1380:   PetscContainer             container;
1381:   MatCOOStruct_SeqAIJKokkos *coo;

1383:   PetscFunctionBegin;
1384:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1385:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1387:   const auto &n    = coo->n;
1388:   const auto &Annz = coo->nz;
1389:   const auto &jmap = coo->jmap;
1390:   const auto &perm = coo->perm;

1392:   PetscCall(PetscGetMemType(v, &memtype));
1393:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we need to copy it to device */
1394:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1395:   } else {
1396:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1397:   }

1399:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1400:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1402:   PetscCall(PetscLogGpuTimeBegin());
1403:   Kokkos::parallel_for(
1404:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1405:       PetscScalar sum = 0.0;
1406:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1407:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1408:     });
1409:   PetscCall(PetscLogGpuTimeEnd());

1411:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1412:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1413:   PetscFunctionReturn(PETSC_SUCCESS);
1414: }
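
/* The user-facing COO assembly path that lands in the two routines above looks like this sketch
   (array contents are illustrative; A is assumed to be a 2x2 MATSEQAIJKOKKOS created beforehand):

     PetscInt    coo_i[] = {0, 0, 1};
     PetscInt    coo_j[] = {0, 1, 1};
     PetscScalar v[]     = {2.0, 1.0, 3.0}; // may reside on host or device
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // repeated (i,j) pairs are summed
*/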

1416: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1417: {
1418:   PetscFunctionBegin;
1419:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1420:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1421:   B->offloadmask = PETSC_OFFLOAD_CPU;
1422:   PetscFunctionReturn(PETSC_SUCCESS);
1423: }

1425: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1426: {
1427:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1429:   PetscFunctionBegin;
1430:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1431:   A->boundtocpu  = PETSC_FALSE;

1433:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1434:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1435:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1436:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1437:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1438:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1439:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1440:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1441:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1442:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1443:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1444:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1445:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1446:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1447:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1448:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1449:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1450:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1451:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1452:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1453:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1454:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1455:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1456:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1457:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1458:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1459:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1461:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1462:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1463: #if defined(PETSC_HAVE_HYPRE)
1464:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1465: #endif
1466:   PetscFunctionReturn(PETSC_SUCCESS);
1467: }
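
/* The "_C" methods composed above are found at runtime by the generic dispatchers, roughly as in
   this sketch (not the exact PETSc implementation):

     PetscErrorCode (*f)(Mat, PetscCount, PetscInt[], PetscInt[]) = NULL;
     PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSetPreallocationCOO_C", &f));
     if (f) PetscCall((*f)(A, coo_n, coo_i, coo_j));
*/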

1469: /*
1470:    Extract the (prescribed) diagonal blocks of the matrix and then invert them

1472:   Input Parameters:
1473: +  A       - the MATSEQAIJKOKKOS matrix
1474: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1475: .  bs2     - squared block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1476: .  blkMap  - maps row ids to block ids, i.e., row i belongs to block blkMap(i)
1477: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1479:   Output Parameter:
1480: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
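
  Example (illustrative): for two blocks of sizes 2 and 3, bs = {0, 2, 5}, bs2 = {0, 4, 13},
  blkMap = {0, 0, 1, 1, 1}, and work/diagVal must each hold bs2(nblocks) = 13 entries.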
1481: */
1482: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1483: {
1484:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1485:   PetscInt          nblocks = bs.extent(0) - 1;

1487:   PetscFunctionBegin;
1488:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1490:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1491:   auto Aa    = akok->a_dual.view_device();
1492:   auto Ai    = akok->i_dual.view_device();
1493:   auto Aj    = akok->j_dual.view_device();
1494:   auto Adiag = akok->diag_dual.view_device();
1495:   // TODO: how to tune the team size?
1496: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1497:   auto ts = Kokkos::AUTO();
1498: #else
1499:   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1500: #endif
1501:   PetscCallCXX(Kokkos::parallel_for(
1502:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1503:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1504:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1505:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1506:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1507:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1509:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1510:         PetscInt i = rstart + r;                                                            // i-th row in A

1512:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1513:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1515:           for (PetscInt c = 0; c < m; c++) {                   // walk m steps to see what column indices we will meet
1516:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1517:               B(r, c) = 0.0;
1518:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1519:               B(r, c) = Aa(first + c);
1520:             } else { // this entry does not show up in the CSR
1521:               B(r, c) = 0.0;
1522:             }
1523:           }
1524:         } else { // rare case that the diagonal does not exist
1525:           const PetscInt begin = Ai(i);
1526:           const PetscInt end   = Ai(i + 1);
1527:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1528:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1529:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1530:             else if (Aj(j) >= rstart + m) break;
1531:           }
1532:         }
1533:       });

1535:       // LU-decompose B (w/o pivoting) and then invert B
1536:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1537:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1538:     }));
1539:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1540:   PetscFunctionReturn(PETSC_SUCCESS);
1541: }

1543: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1544: {
1545:   Mat_SeqAIJ *aseq;
1546:   PetscInt    i, m, n;
1547:   auto        exec = PetscGetKokkosExecutionSpace();

1549:   PetscFunctionBegin;
1550:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1552:   m = akok->nrows();
1553:   n = akok->ncols();
1554:   PetscCall(MatSetSizes(A, m, n, m, n));
1555:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1557:   /* Set up data structures of A as a MATSEQAIJ */
1558:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1559:   aseq = (Mat_SeqAIJ *)A->data;

1561:   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1562:   PetscCallCXX(akok->j_dual.sync_host(exec));
1563:   PetscCallCXX(exec.fence());

1565:   aseq->i       = akok->i_host_data();
1566:   aseq->j       = akok->j_host_data();
1567:   aseq->a       = akok->a_host_data();
1568:   aseq->nonew   = -1; /* this indicates that inserting a value that would create a new nonzero is an error */
1569:   aseq->free_a  = PETSC_FALSE;
1570:   aseq->free_ij = PETSC_FALSE;
1571:   aseq->nz      = akok->nnz();
1572:   aseq->maxnz   = aseq->nz;

1574:   PetscCall(PetscMalloc1(m, &aseq->imax));
1575:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1576:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1578:   /* It is critical to set the nonzerostate, as we use it to check if the sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1579:   akok->nonzerostate = A->nonzerostate;
1580:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1581:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1582:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1583:   PetscFunctionReturn(PETSC_SUCCESS);
1584: }

1586: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1587: {
1588:   PetscFunctionBegin;
1589:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1590:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1591:   PetscFunctionReturn(PETSC_SUCCESS);
1592: }

1594: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1595: {
1596:   Mat_SeqAIJKokkos *akok;

1598:   PetscFunctionBegin;
1599:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1600:   PetscCall(MatCreate(comm, A));
1601:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1602:   PetscFunctionReturn(PETSC_SUCCESS);
1603: }

1605: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1607:    Note PETSc already has names like MatSeqAIJSetPreallocationCSR(), so we use capitalized CSR here for consistency
1608:  */
1609: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1610: {
1611:   PetscFunctionBegin;
1612:   PetscCall(MatCreate(comm, A));
1613:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1614:   PetscFunctionReturn(PETSC_SUCCESS);
1615: }

1617: /*@C
1618:   MatCreateSeqAIJKokkos - Creates a sequential sparse matrix in `MATSEQAIJKOKKOS` (compressed row)
1619:   format. This matrix will ultimately be handled by
1620:   Kokkos for calculations.

1622:   Collective

1624:   Input Parameters:
1625: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1626: . m    - number of rows
1627: . n    - number of columns
1628: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1629: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1631:   Output Parameter:
1632: . A - the matrix

1634:   Level: intermediate

1636:   Notes:
1637:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1638:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1639:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1641:   The AIJ format, also called
1642:   compressed row storage, is fully compatible with standard Fortran
1643:   storage.  That is, the stored row and column indices can begin at
1644:   either one (as in Fortran) or zero.

1646:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1647:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1648:   allocation.
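
  Example Usage (a minimal sketch; the sizes and per-row nonzero bound are illustrative):
.vb
  Mat A;
  PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 10, 10, 5, NULL, &A));
.ve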

1650: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1651: @*/
1652: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1653: {
1654:   PetscFunctionBegin;
1655:   PetscCall(PetscKokkosInitializeCheck());
1656:   PetscCall(MatCreate(comm, A));
1657:   PetscCall(MatSetSizes(*A, m, n, m, n));
1658:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1659:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1660:   PetscFunctionReturn(PETSC_SUCCESS);
1661: }

1663: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1664: {
1665:   PetscFunctionBegin;
1666:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1667:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1668:   PetscFunctionReturn(PETSC_SUCCESS);
1669: }

1671: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1672: {
1673:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1675:   PetscFunctionBegin;
1676:   if (!factors->sptrsv_symbolic_completed) {
1677:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1678:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1679:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1680:   }
1681:   PetscFunctionReturn(PETSC_SUCCESS);
1682: }

1684: /* Check if we need to update factors etc for transpose solve */
1685: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1686: {
1687:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1688:   MatColIdxType               n       = A->rmap->n;

1690:   PetscFunctionBegin;
1691:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1692:     /* Update L^T and do sptrsv symbolic */
1693:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // value-initialized to 0, as KK's transpose_matrix() requires
1694:     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1695:     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));

1697:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1698:                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);

1700:     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1701:       We have to sort the indices, until KK provides finer control options.
1702:     */
1703:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

1705:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

1707:     /* Update U^T and do sptrsv symbolic */
1708:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // value-initialized to 0, as KK's transpose_matrix() requires
1709:     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1710:     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));

1712:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1713:                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);

1715:     /* Sort indices. See comments above */
1716:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

1718:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1719:     factors->transpose_updated = PETSC_TRUE;
1720:   }
1721:   PetscFunctionReturn(PETSC_SUCCESS);
1722: }

1724: /* Solve Ax = b, with A = LU */
1725: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1726: {
1727:   ConstPetscScalarKokkosView  bv;
1728:   PetscScalarKokkosView       xv;
1729:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1731:   PetscFunctionBegin;
1732:   PetscCall(PetscLogGpuTimeBegin());
1733:   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1734:   PetscCall(VecGetKokkosView(b, &bv));
1735:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1736:   /* Solve L tmpv = b */
1737:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1738:   /* Solve Ux = tmpv */
1739:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1740:   PetscCall(VecRestoreKokkosView(b, &bv));
1741:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1742:   PetscCall(PetscLogGpuTimeEnd());
1743:   PetscFunctionReturn(PETSC_SUCCESS);
1744: }

1746: /* Solve A^T x = b, where A^T = U^T L^T */
1747: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1748: {
1749:   ConstPetscScalarKokkosView  bv;
1750:   PetscScalarKokkosView       xv;
1751:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1753:   PetscFunctionBegin;
1754:   PetscCall(PetscLogGpuTimeBegin());
1755:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1756:   PetscCall(VecGetKokkosView(b, &bv));
1757:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1758:   /* Solve U^T tmpv = b */
1759:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector));

1761:   /* Solve L^T x = tmpv */
1762:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv));
1763:   PetscCall(VecRestoreKokkosView(b, &bv));
1764:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1765:   PetscCall(PetscLogGpuTimeEnd());
1766:   PetscFunctionReturn(PETSC_SUCCESS);
1767: }

1769: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1770: {
1771:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1772:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1773:   PetscInt                    fill_lev = info->levels;

1775:   PetscFunctionBegin;
1776:   PetscCall(PetscLogGpuTimeBegin());
1777:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

1779:   auto a_d = aijkok->a_dual.view_device();
1780:   auto i_d = aijkok->i_dual.view_device();
1781:   auto j_d = aijkok->j_dual.view_device();

1783:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);

1785:   B->assembled              = PETSC_TRUE;
1786:   B->preallocated           = PETSC_TRUE;
1787:   B->ops->solve             = MatSolve_SeqAIJKokkos;
1788:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1789:   B->ops->matsolve          = NULL;
1790:   B->ops->matsolvetranspose = NULL;
1791:   B->offloadmask            = PETSC_OFFLOAD_GPU;

1793:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1794:   factors->transpose_updated         = PETSC_FALSE;
1795:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1796:   /* TODO: log flops, but the factorization flop count is not readily available here */
1797:   PetscCall(PetscLogGpuTimeEnd());
1798:   PetscFunctionReturn(PETSC_SUCCESS);
1799: }

1801: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1802: {
1803:   Mat_SeqAIJKokkos           *aijkok;
1804:   Mat_SeqAIJ                 *b;
1805:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1806:   PetscInt                    fill_lev = info->levels;
1807:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1808:   PetscInt                    n        = A->rmap->n;

1810:   PetscFunctionBegin;
1811:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1812:   /* Rebuild factors */
1813:   if (factors) { /* Destroy the old factors if they exist */
1814:     factors->Destroy();
1815:   } else {
1817:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1818:   }

1820:   /* Create a spiluk handle and then do symbolic factorization */
1821:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1822:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

1824:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1826:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1827:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1828:   Kokkos::realloc(factors->iU_d, n + 1);
1829:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

1831:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1832:   auto i_d = aijkok->i_dual.view_device();
1833:   auto j_d = aijkok->j_dual.view_device();
1834:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1835:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1837:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1838:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1839:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1840:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

1842:   /* TODO: add options to select sptrsv algorithms */
1843:   /* Create sptrsv handles for L, U and their transpose */
1844: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1845:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1846: #else
1847:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1848: #endif

1850:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1851:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1852:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1853:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

1855:   /* Fill fields of the factor matrix B */
1856:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1857:   b     = (Mat_SeqAIJ *)B->data;
1858:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1859:   B->info.fill_ratio_given  = info->fill;
1860:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

1862:   B->offloadmask          = PETSC_OFFLOAD_GPU;
1863:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1864:   PetscFunctionReturn(PETSC_SUCCESS);
1865: }
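
/* From the command line, the ILU(k) path above is typically selected with standard PETSc options,
   e.g. (a sketch): -pc_type ilu -pc_factor_levels 1 -pc_factor_mat_solver_type kokkos,
   where -pc_factor_levels sets the info->levels used above. */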

1867: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1868: {
1869:   PetscFunctionBegin;
1870:   *type = MATSOLVERKOKKOS;
1871:   PetscFunctionReturn(PETSC_SUCCESS);
1872: }

1874: /*MC
1875:   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers on a single GPU
1876:   for sequential matrices of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
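
  A typical command-line selection, using standard PETSc options (a sketch):
.vb
  -pc_type lu -pc_factor_mat_solver_type kokkos
.ve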

1878:   Level: beginner

1880: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1881: M*/
1882: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1883: {
1884:   PetscInt n = A->rmap->n;

1886:   PetscFunctionBegin;
1887:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1888:   PetscCall(MatSetSizes(*B, n, n, n, n));
1889:   (*B)->factortype = ftype;
1890:   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1891:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

1893:   if (ftype == MAT_FACTOR_LU) {
1894:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1895:     (*B)->canuseordering        = PETSC_TRUE;
1896:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1897:   } else if (ftype == MAT_FACTOR_ILU) {
1898:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1899:     (*B)->canuseordering         = PETSC_FALSE;
1900:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1901:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1903:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1904:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1905:   PetscFunctionReturn(PETSC_SUCCESS);
1906: }

1908: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1909: {
1910:   PetscFunctionBegin;
1911:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1912:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1913:   PetscFunctionReturn(PETSC_SUCCESS);
1914: }

1916: /* Utility to print out a KokkosCsrMatrix for debugging */
1917: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1918: {
1919:   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
1920:   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
1921:   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1922:   const PetscInt    *i  = iv.data();
1923:   const PetscInt    *j  = jv.data();
1924:   const PetscScalar *a  = av.data();
1925:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

1927:   PetscFunctionBegin;
1928:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1929:   for (PetscInt k = 0; k < m; k++) {
1930:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1931:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1932:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1933:   }
1934:   PetscFunctionReturn(PETSC_SUCCESS);
1935: }
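
/* Sample output of the routine above for a 2x2 identity (illustrative):

     2 x 2 SeqAIJKokkos, with 2 nonzeros
     0: 0(1.0),
     1: 1(1.0),
*/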