Actual source code: aijkok.kokkos.cxx
1: #include <petsc_kokkos.hpp>
2: #include <petscvec_kokkos.hpp>
3: #include <petscmat_kokkos.hpp>
4: #include <petscpkg_version.h>
5: #include <petsc/private/petscimpl.h>
6: #include <petsc/private/sfimpl.h>
7: #include <petscsystypes.h>
8: #include <petscerror.h>
10: #include <Kokkos_Core.hpp>
11: #include <KokkosBlas.hpp>
12: #include <KokkosSparse_CrsMatrix.hpp>
14: // To suppress compiler warnings:
15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20: #include <KokkosSparse_spmv.hpp>
21: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
23: #include <KokkosSparse_spiluk.hpp>
24: #include <KokkosSparse_sptrsv.hpp>
25: #include <KokkosSparse_spgemm.hpp>
26: #include <KokkosSparse_spadd.hpp>
27: #include <KokkosBatched_LU_Decl.hpp>
28: #include <KokkosBatched_InverseLU_Decl.hpp>
30: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
32: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33: #include <KokkosSparse_Utils.hpp>
34: using KokkosSparse::sort_crs_matrix;
35: using KokkosSparse::Impl::transpose_matrix;
36: #else
37: #include <KokkosKernels_Sorting.hpp>
38: using KokkosKernels::sort_crs_matrix;
39: using KokkosKernels::Impl::transpose_matrix;
40: #endif
42: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
44: /* MatAssemblyEnd_SeqAIJKokkos() is called when the nonzeros of the matrix have been finalized, either after
45: we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
46: In the latter case, it is important to set a_dual's sync state correctly.
47: */
48: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
49: {
50: Mat_SeqAIJ *aijseq;
51: Mat_SeqAIJKokkos *aijkok;
53: PetscFunctionBegin;
54: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
55: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
57: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
58: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
60: /* If aijkok does not exist, we just copy i, j to device.
61: If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
62: In both cases, we build a new aijkok structure.
63: */
64: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
65: delete aijkok;
66: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
67: A->spptr = aijkok;
68: } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
69: MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
70: auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
71: aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
72: }
73: PetscFunctionReturn(PETSC_SUCCESS);
74: }
76: /* Sync CSR data to device if not yet synced */
77: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
78: {
79: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
81: PetscFunctionBegin;
82: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
83: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
84: if (aijkok->a_dual.need_sync_device()) {
85: aijkok->a_dual.sync_device();
86: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
87: aijkok->hermitian_updated = PETSC_FALSE;
88: }
89: PetscFunctionReturn(PETSC_SUCCESS);
90: }
92: /* Mark the CSR data on device as modified */
93: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
94: {
95: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
97: PetscFunctionBegin;
98: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
99: aijkok->a_dual.clear_sync_state();
100: aijkok->a_dual.modify_device();
101: aijkok->transpose_updated = PETSC_FALSE;
102: aijkok->hermitian_updated = PETSC_FALSE;
103: PetscCall(MatSeqAIJInvalidateDiagonal(A));
104: PetscCall(PetscObjectStateIncrease((PetscObject)A));
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
108: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
109: {
110: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111: auto exec = PetscGetKokkosExecutionSpace();
113: PetscFunctionBegin;
114: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
115: /* We do not expect factors to be needed on host */
116: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
117: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
118: PetscCallCXX(aijkok->a_dual.sync_host(exec));
119: PetscCallCXX(exec.fence());
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
124: {
125: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127: PetscFunctionBegin;
128: /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
129: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
130: reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
131: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
132: */
133: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
134: auto exec = PetscGetKokkosExecutionSpace();
135: PetscCallCXX(aijkok->a_dual.sync_host(exec));
136: PetscCallCXX(exec.fence());
137: *array = aijkok->a_dual.view_host().data();
138: } else { /* Happens when calling MatSetValues on a newly created matrix */
139: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
140: }
141: PetscFunctionReturn(PETSC_SUCCESS);
142: }
144: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
145: {
146: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
148: PetscFunctionBegin;
149: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
153: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
154: {
155: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157: PetscFunctionBegin;
158: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
159: auto exec = PetscGetKokkosExecutionSpace();
160: PetscCallCXX(aijkok->a_dual.sync_host(exec));
161: PetscCallCXX(exec.fence());
162: *array = aijkok->a_dual.view_host().data();
163: } else {
164: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
165: }
166: PetscFunctionReturn(PETSC_SUCCESS);
167: }
169: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
170: {
171: PetscFunctionBegin;
172: *array = NULL;
173: PetscFunctionReturn(PETSC_SUCCESS);
174: }
176: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177: {
178: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
180: PetscFunctionBegin;
181: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182: *array = aijkok->a_dual.view_host().data();
183: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
184: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
185: }
186: PetscFunctionReturn(PETSC_SUCCESS);
187: }
189: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
190: {
191: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
193: PetscFunctionBegin;
194: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
195: aijkok->a_dual.clear_sync_state();
196: aijkok->a_dual.modify_host();
197: }
198: PetscFunctionReturn(PETSC_SUCCESS);
199: }
201: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
202: {
203: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205: PetscFunctionBegin;
206: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
208: if (i) *i = aijkok->i_device_data();
209: if (j) *j = aijkok->j_device_data();
210: if (a) {
211: aijkok->a_dual.sync_device();
212: *a = aijkok->a_device_data();
213: }
214: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
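/* Caller-side sketch: the routine above backs the public MatSeqAIJGetCSRAndMemType(). A minimal
   use, assuming A is an assembled MATSEQAIJKOKKOS matrix (pass NULL for any output not needed):

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     // i, j, a now point to device memory and mtype is PETSC_MEMTYPE_KOKKOS
*/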
218: /*
219: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
221: Input Parameter:
222: . A - the MATSEQAIJKOKKOS matrix
224: Output Parameters:
225: . perm_d - the permutation array on device, which satisfies Ta(i) = Aa(perm(i))
226: - T_d - the transpose on device, whose value array is allocated but not initialized
227: */
228: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
229: {
230: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
231: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
232: const PetscInt *Ai = aseq->i, *Aj = aseq->j;
233: MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
234: MatRowMapType *Ti = Ti_h.data();
235: MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
236: MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
237: PetscInt *Tj = Tj_h.data();
238: PetscInt *perm = perm_h.data();
239: PetscInt *offset;
241: PetscFunctionBegin;
242: // Populate Ti
243: PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
244: Ti++;
245: for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
246: Ti--;
247: for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
249: // Populate Tj and the permutation array
250: PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
251: for (PetscInt i = 0; i < m; i++) {
252: for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
253: PetscInt r = Aj[j]; // row r of T
254: PetscInt disp = Ti[r] + offset[r];
256: Tj[disp] = i; // col i of T
257: perm[disp] = j;
258: offset[r]++;
259: }
260: }
261: PetscCall(PetscFree(offset));
263: // Sort each row of T, along with the permutation array
264: for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
266: // Output perm and T on device
267: auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
268: auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
269: PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
270: PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
271: PetscFunctionReturn(PETSC_SUCCESS);
272: }
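/* Worked example of the output: for a hypothetical 2x3 matrix A with Ai = {0,2,3}, Aj = {0,2,1}
   and values Aa = {a00,a02,a11}, the 3x2 transpose T gets Ti = {0,1,2,3}, Tj = {0,1,0} and
   perm = {0,2,1}, so that Ta(i) = Aa(perm(i)) yields Ta = {a00,a11,a02}. */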
274: // Generate the transpose on device and cache it internally
275: // Note: KK's transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
276: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
277: {
278: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
279: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
280: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
281: KokkosCsrMatrix &T = akok->csrmatT;
283: PetscFunctionBegin;
284: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
285: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
287: const auto &Aa = akok->a_dual.view_device();
289: if (A->symmetric == PETSC_BOOL3_TRUE) {
290: *csrmatT = akok->csrmat;
291: } else {
292: // See if we already have a cached transpose and its value is up to date
293: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
294: if (!akok->transpose_updated) { // if the value is out of date, update the cached version
295: const auto &perm = akok->transpose_perm; // get the permutation array
296: auto &Ta = T.values;
298: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
299: }
300: } else { // Generate T of size n x m for the first time
301: MatRowMapKokkosView perm;
303: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
304: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
305: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
306: }
307: akok->transpose_updated = PETSC_TRUE;
308: *csrmatT = akok->csrmatT;
309: }
310: PetscFunctionReturn(PETSC_SUCCESS);
311: }
313: // Generate the Hermitian on device and cache it internally
314: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
315: {
316: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
317: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
318: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
319: KokkosCsrMatrix &T = akok->csrmatH;
321: PetscFunctionBegin;
322: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
323: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
325: const auto &Aa = akok->a_dual.view_device();
327: if (A->hermitian == PETSC_BOOL3_TRUE) {
328: *csrmatH = akok->csrmat;
329: } else {
330: // See if we already have a cached hermitian and its value is up to date
331: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
332: if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
333: const auto &perm = akok->transpose_perm; // get the permutation array
334: auto &Ta = T.values;
336: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
337: }
338: } else { // Generate T of size n x m for the first time
339: MatRowMapKokkosView perm;
341: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
342: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
343: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
344: }
345: akok->hermitian_updated = PETSC_TRUE;
346: *csrmatH = akok->csrmatH;
347: }
348: PetscFunctionReturn(PETSC_SUCCESS);
349: }
351: /* y = A x */
352: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
353: {
354: Mat_SeqAIJKokkos *aijkok;
355: ConstPetscScalarKokkosView xv;
356: PetscScalarKokkosView yv;
358: PetscFunctionBegin;
359: PetscCall(PetscLogGpuTimeBegin());
360: PetscCall(MatSeqAIJKokkosSyncDevice(A));
361: PetscCall(VecGetKokkosView(xx, &xv));
362: PetscCall(VecGetKokkosViewWrite(yy, &yv));
363: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
364: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
365: PetscCall(VecRestoreKokkosView(xx, &xv));
366: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
367: /* 2.0*nnz - numRows would be more accurate, but that assumes there are no zero rows, so we are a little sloppy here. */
368: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
369: PetscCall(PetscLogGpuTimeEnd());
370: PetscFunctionReturn(PETSC_SUCCESS);
371: }
373: /* y = A^T x */
374: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
375: {
376: Mat_SeqAIJKokkos *aijkok;
377: const char *mode;
378: ConstPetscScalarKokkosView xv;
379: PetscScalarKokkosView yv;
380: KokkosCsrMatrix csrmat;
382: PetscFunctionBegin;
383: PetscCall(PetscLogGpuTimeBegin());
384: PetscCall(MatSeqAIJKokkosSyncDevice(A));
385: PetscCall(VecGetKokkosView(xx, &xv));
386: PetscCall(VecGetKokkosViewWrite(yy, &yv));
387: if (A->form_explicit_transpose) {
388: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
389: mode = "N";
390: } else {
391: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
392: csrmat = aijkok->csrmat;
393: mode = "T";
394: }
395: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
396: PetscCall(VecRestoreKokkosView(xx, &xv));
397: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
398: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
399: PetscCall(PetscLogGpuTimeEnd());
400: PetscFunctionReturn(PETSC_SUCCESS);
401: }
403: /* y = A^H x */
404: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405: {
406: Mat_SeqAIJKokkos *aijkok;
407: const char *mode;
408: ConstPetscScalarKokkosView xv;
409: PetscScalarKokkosView yv;
410: KokkosCsrMatrix csrmat;
412: PetscFunctionBegin;
413: PetscCall(PetscLogGpuTimeBegin());
414: PetscCall(MatSeqAIJKokkosSyncDevice(A));
415: PetscCall(VecGetKokkosView(xx, &xv));
416: PetscCall(VecGetKokkosViewWrite(yy, &yv));
417: if (A->form_explicit_transpose) {
418: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
419: mode = "N";
420: } else {
421: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422: csrmat = aijkok->csrmat;
423: mode = "C";
424: }
425: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
426: PetscCall(VecRestoreKokkosView(xx, &xv));
427: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
428: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
429: PetscCall(PetscLogGpuTimeEnd());
430: PetscFunctionReturn(PETSC_SUCCESS);
431: }
433: /* z = A x + y */
434: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
435: {
436: Mat_SeqAIJKokkos *aijkok;
437: ConstPetscScalarKokkosView xv;
438: PetscScalarKokkosView zv;
440: PetscFunctionBegin;
441: PetscCall(PetscLogGpuTimeBegin());
442: PetscCall(MatSeqAIJKokkosSyncDevice(A));
443: if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
444: PetscCall(VecGetKokkosView(xx, &xv));
445: PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
446: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
447: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
448: PetscCall(VecRestoreKokkosView(xx, &xv));
449: PetscCall(VecRestoreKokkosView(zz, &zv));
450: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
451: PetscCall(PetscLogGpuTimeEnd());
452: PetscFunctionReturn(PETSC_SUCCESS);
453: }
455: /* z = A^T x + y */
456: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
457: {
458: Mat_SeqAIJKokkos *aijkok;
459: const char *mode;
460: ConstPetscScalarKokkosView xv;
461: PetscScalarKokkosView zv;
462: KokkosCsrMatrix csrmat;
464: PetscFunctionBegin;
465: PetscCall(PetscLogGpuTimeBegin());
466: PetscCall(MatSeqAIJKokkosSyncDevice(A));
467: if (zz != yy) PetscCall(VecCopy(yy, zz));
468: PetscCall(VecGetKokkosView(xx, &xv));
469: PetscCall(VecGetKokkosView(zz, &zv));
470: if (A->form_explicit_transpose) {
471: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
472: mode = "N";
473: } else {
474: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
475: csrmat = aijkok->csrmat;
476: mode = "T";
477: }
478: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
479: PetscCall(VecRestoreKokkosView(xx, &xv));
480: PetscCall(VecRestoreKokkosView(zz, &zv));
481: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
482: PetscCall(PetscLogGpuTimeEnd());
483: PetscFunctionReturn(PETSC_SUCCESS);
484: }
486: /* z = A^H x + y */
487: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
488: {
489: Mat_SeqAIJKokkos *aijkok;
490: const char *mode;
491: ConstPetscScalarKokkosView xv;
492: PetscScalarKokkosView zv;
493: KokkosCsrMatrix csrmat;
495: PetscFunctionBegin;
496: PetscCall(PetscLogGpuTimeBegin());
497: PetscCall(MatSeqAIJKokkosSyncDevice(A));
498: if (zz != yy) PetscCall(VecCopy(yy, zz));
499: PetscCall(VecGetKokkosView(xx, &xv));
500: PetscCall(VecGetKokkosView(zz, &zv));
501: if (A->form_explicit_transpose) {
502: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
503: mode = "N";
504: } else {
505: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
506: csrmat = aijkok->csrmat;
507: mode = "C";
508: }
509: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
510: PetscCall(VecRestoreKokkosView(xx, &xv));
511: PetscCall(VecRestoreKokkosView(zz, &zv));
512: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
513: PetscCall(PetscLogGpuTimeEnd());
514: PetscFunctionReturn(PETSC_SUCCESS);
515: }
517: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
518: {
519: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
521: PetscFunctionBegin;
522: switch (op) {
523: case MAT_FORM_EXPLICIT_TRANSPOSE:
524: /* need to destroy the cached transpose matrix, if present, to prevent logic errors if flg is set to true later */
525: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
526: A->form_explicit_transpose = flg;
527: break;
528: default:
529: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
530: break;
531: }
532: PetscFunctionReturn(PETSC_SUCCESS);
533: }
535: /* Depending on reuse, either build a new mat, or use the existing mat */
536: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
537: {
538: Mat_SeqAIJ *aseq;
540: PetscFunctionBegin;
541: PetscCall(PetscKokkosInitializeCheck());
542: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
543: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */
544: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
545: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
546: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
547: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
548: PetscCall(PetscFree(A->defaultvectype));
549: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
550: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
551: PetscCall(MatSetOps_SeqAIJKokkos(A));
552: aseq = static_cast<Mat_SeqAIJ *>(A->data);
553: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
554: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
555: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
556: }
557: }
558: PetscFunctionReturn(PETSC_SUCCESS);
559: }
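/* Usage sketch for the conversion above, assuming A is an assembled MATSEQAIJ matrix:

     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); // convert A in place
     // or keep A intact and build a separate Kokkos copy:
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &B));
*/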
561: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
562: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
563: */
564: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
565: {
566: Mat_SeqAIJ *bseq;
567: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
568: Mat mat;
570: PetscFunctionBegin;
571: /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
572: PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
573: mat = *B;
574: if (A->assembled) {
575: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
576: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
577: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
578: /* Now copy values to B if needed */
579: if (dupOption == MAT_COPY_VALUES) {
580: if (akok->a_dual.need_sync_device()) {
581: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
582: bkok->a_dual.modify_host();
583: } else { /* If device has the latest data, we only copy data on device */
584: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
585: bkok->a_dual.modify_device();
586: }
587: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
588: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
589: bkok->a_dual.modify_host();
590: }
591: mat->spptr = bkok;
592: }
594: PetscCall(PetscFree(mat->defaultvectype));
595: PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
596: PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
597: PetscCall(MatSetOps_SeqAIJKokkos(mat));
598: PetscFunctionReturn(PETSC_SUCCESS);
599: }
601: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
602: {
603: Mat At;
604: KokkosCsrMatrix internT;
605: Mat_SeqAIJKokkos *atkok, *bkok;
607: PetscFunctionBegin;
608: if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
609: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
610: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
611: /* Deep copy internT, as we want to isolate the internal transpose */
612: PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
613: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
614: if (reuse == MAT_INITIAL_MATRIX) *B = At;
615: else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
616: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
617: if ((*B)->assembled) {
618: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
619: PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
620: PetscCall(MatSeqAIJKokkosModifyDevice(*B));
621: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
622: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
623: MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
624: MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
625: PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
626: PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
627: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
628: }
629: PetscFunctionReturn(PETSC_SUCCESS);
630: }
632: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
633: {
634: Mat_SeqAIJKokkos *aijkok;
636: PetscFunctionBegin;
637: if (A->factortype == MAT_FACTOR_NONE) {
638: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
639: delete aijkok;
640: } else {
641: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
642: }
643: A->spptr = NULL;
644: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
645: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
646: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
647: #if defined(PETSC_HAVE_HYPRE)
648: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
649: #endif
650: PetscCall(MatDestroy_SeqAIJ(A));
651: PetscFunctionReturn(PETSC_SUCCESS);
652: }
654: /*MC
655: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
657: A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
659: Options Database Key:
660: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
662: Level: beginner
664: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
665: M*/
666: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
667: {
668: PetscFunctionBegin;
669: PetscCall(PetscKokkosInitializeCheck());
670: PetscCall(MatCreate_SeqAIJ(A));
671: PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
672: PetscFunctionReturn(PETSC_SUCCESS);
673: }
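/* Usage sketch via the options database, as documented above (m, n are placeholder sizes):

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, m, n));
     PetscCall(MatSetFromOptions(A)); // run with -mat_type aijkokkos to select this type
     PetscCall(MatSetUp(A));
*/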
675: /* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
676: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
677: {
678: Mat_SeqAIJ *a, *b;
679: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
680: MatScalarKokkosView aa, ba, ca;
681: MatRowMapKokkosView ai, bi, ci;
682: MatColIdxKokkosView aj, bj, cj;
683: PetscInt m, n, nnz, aN;
685: PetscFunctionBegin;
688: PetscAssertPointer(C, 4);
689: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
690: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
691: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
692: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
694: PetscCall(MatSeqAIJKokkosSyncDevice(A));
695: PetscCall(MatSeqAIJKokkosSyncDevice(B));
696: a = static_cast<Mat_SeqAIJ *>(A->data);
697: b = static_cast<Mat_SeqAIJ *>(B->data);
698: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
699: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
700: aa = akok->a_dual.view_device();
701: ai = akok->i_dual.view_device();
702: ba = bkok->a_dual.view_device();
703: bi = bkok->i_dual.view_device();
704: m = A->rmap->n; /* M, N and nnz of C */
705: n = A->cmap->n + B->cmap->n;
706: nnz = a->nz + b->nz;
707: aN = A->cmap->n; /* N of A */
708: if (reuse == MAT_INITIAL_MATRIX) {
709: aj = akok->j_dual.view_device();
710: bj = bkok->j_dual.view_device();
711: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
712: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
713: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
714: ca = ca_dual.view_device();
715: ci = ci_dual.view_device();
716: cj = cj_dual.view_device();
718: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
719: Kokkos::parallel_for(
720: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
721: PetscInt i = t.league_rank(); /* row i */
722: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
724: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
725: ci(i) = coffset;
726: if (i == m - 1) ci(m) = ai(m) + bi(m);
727: });
729: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
730: if (k < alen) {
731: ca(coffset + k) = aa(ai(i) + k);
732: cj(coffset + k) = aj(ai(i) + k);
733: } else {
734: ca(coffset + k) = ba(bi(i) + k - alen);
735: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
736: }
737: });
738: });
739: ca_dual.modify_device();
740: ci_dual.modify_device();
741: cj_dual.modify_device();
742: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
743: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
744: } else if (reuse == MAT_REUSE_MATRIX) {
746: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
747: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
748: ca = ckok->a_dual.view_device();
749: ci = ckok->i_dual.view_device();
751: Kokkos::parallel_for(
752: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753: PetscInt i = t.league_rank(); /* row i */
754: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
756: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
757: else ca(ci(i) + k) = ba(bi(i) + k - alen);
758: });
759: });
760: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
761: }
762: PetscFunctionReturn(PETSC_SUCCESS);
763: }
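/* The merged layout, row by row: for each row i, C holds A's entries first, then B's entries with
   column indices shifted by aN = A->cmap->n, i.e. C = [A | B]:

     C row i: A's nonzeros in columns 0..aN-1, then B's nonzeros mapped to columns aN..aN+bN-1
*/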
765: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
766: {
767: PetscFunctionBegin;
768: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
769: PetscFunctionReturn(PETSC_SUCCESS);
770: }
772: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
773: {
774: Mat_Product *product = C->product;
775: Mat A, B;
776: bool transA, transB; /* use bool, since KK needs this type */
777: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
778: Mat_SeqAIJ *c;
779: MatProductData_SeqAIJKokkos *pdata;
780: KokkosCsrMatrix csrmatA, csrmatB;
782: PetscFunctionBegin;
783: MatCheckProduct(C, 1);
784: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
785: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
787: // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
788: // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
789: // we still do numeric.
790: if (pdata->reusesym) { // numeric reuses results from symbolic
791: pdata->reusesym = PETSC_FALSE;
792: PetscFunctionReturn(PETSC_SUCCESS);
793: }
795: switch (product->type) {
796: case MATPRODUCT_AB:
797: transA = false;
798: transB = false;
799: break;
800: case MATPRODUCT_AtB:
801: transA = true;
802: transB = false;
803: break;
804: case MATPRODUCT_ABt:
805: transA = false;
806: transB = true;
807: break;
808: default:
809: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
810: }
812: A = product->A;
813: B = product->B;
814: PetscCall(MatSeqAIJKokkosSyncDevice(A));
815: PetscCall(MatSeqAIJKokkosSyncDevice(B));
816: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
817: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
818: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
820: PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
822: csrmatA = akok->csrmat;
823: csrmatB = bkok->csrmat;
825: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
826: if (transA) {
827: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
828: transA = false;
829: }
831: if (transB) {
832: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
833: transB = false;
834: }
835: PetscCall(PetscLogGpuTimeBegin());
836: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
837: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
838: auto spgemmHandle = pdata->kh.get_spgemm_handle();
839: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
840: #endif
842: PetscCall(PetscLogGpuTimeEnd());
843: PetscCall(MatSeqAIJKokkosModifyDevice(C));
844: /* shorter version of MatAssemblyEnd_SeqAIJ */
845: c = (Mat_SeqAIJ *)C->data;
846: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
847: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
848: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
849: c->reallocs = 0;
850: C->info.mallocs = 0;
851: C->info.nz_unneeded = 0;
852: C->assembled = C->was_assembled = PETSC_TRUE;
853: C->num_ass++;
854: PetscFunctionReturn(PETSC_SUCCESS);
855: }
857: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
858: {
859: Mat_Product *product = C->product;
860: MatProductType ptype;
861: Mat A, B;
862: bool transA, transB;
863: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
864: MatProductData_SeqAIJKokkos *pdata;
865: MPI_Comm comm;
866: KokkosCsrMatrix csrmatA, csrmatB, csrmatC;
868: PetscFunctionBegin;
869: MatCheckProduct(C, 1);
870: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
871: PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
872: A = product->A;
873: B = product->B;
874: PetscCall(MatSeqAIJKokkosSyncDevice(A));
875: PetscCall(MatSeqAIJKokkosSyncDevice(B));
876: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
877: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
878: csrmatA = akok->csrmat;
879: csrmatB = bkok->csrmat;
881: ptype = product->type;
882: // Take advantage of the symmetry if true
883: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
884: ptype = MATPRODUCT_AB;
885: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
886: }
887: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
888: ptype = MATPRODUCT_AB;
889: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
890: }
892: switch (ptype) {
893: case MATPRODUCT_AB:
894: transA = false;
895: transB = false;
896: PetscCall(MatSetBlockSizesFromMats(C, A, B));
897: break;
898: case MATPRODUCT_AtB:
899: transA = true;
900: transB = false;
901: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
902: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
903: break;
904: case MATPRODUCT_ABt:
905: transA = false;
906: transB = true;
907: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
908: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
909: break;
910: default:
911: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
912: }
913: PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
914: pdata->reusesym = product->api_user;
916: /* TODO: add command line options to select spgemm algorithms */
917: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
919: /* CUDA 10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
920: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
921: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
922: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
923: #endif
924: #endif
925: PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
927: PetscCall(PetscLogGpuTimeBegin());
928: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
929: if (transA) {
930: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
931: transA = false;
932: }
934: if (transB) {
935: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
936: transB = false;
937: }
939: PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
940: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
941: So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
942: calling new Mat_SeqAIJKokkos().
943: TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
944: */
945: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
946: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
947: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
948: auto spgemmHandle = pdata->kh.get_spgemm_handle();
949: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
950: #endif
951: PetscCall(PetscLogGpuTimeEnd());
953: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
954: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
955: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
956: PetscFunctionReturn(PETSC_SUCCESS);
957: }
959: /* handles sparse matrix matrix ops */
960: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
961: {
962: Mat_Product *product = mat->product;
963: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
965: PetscFunctionBegin;
966: MatCheckProduct(mat, 1);
967: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
968: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
969: if (Biskok && Ciskok) {
970: switch (product->type) {
971: case MATPRODUCT_AB:
972: case MATPRODUCT_AtB:
973: case MATPRODUCT_ABt:
974: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
975: break;
976: case MATPRODUCT_PtAP:
977: case MATPRODUCT_RARt:
978: case MATPRODUCT_ABC:
979: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
980: break;
981: default:
982: break;
983: }
984: } else { /* fallback for AIJ */
985: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
986: }
987: PetscFunctionReturn(PETSC_SUCCESS);
988: }
990: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
991: {
992: Mat_SeqAIJKokkos *aijkok;
994: PetscFunctionBegin;
995: PetscCall(PetscLogGpuTimeBegin());
996: PetscCall(MatSeqAIJKokkosSyncDevice(A));
997: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
998: KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
999: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1000: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1001: PetscCall(PetscLogGpuTimeEnd());
1002: PetscFunctionReturn(PETSC_SUCCESS);
1003: }
1005: // add a to A's diagonal (for rectangular A, the main diagonal of length min(m,n))
1006: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1007: {
1008: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1010: PetscFunctionBegin;
1011: if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1012: PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1014: PetscCall(PetscLogGpuTimeBegin());
1015: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1016: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1017: const auto &Aa = aijkok->a_dual.view_device();
1018: const auto &Adiag = aijkok->diag_dual.view_device();
1019: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1020: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021: PetscCall(PetscLogGpuFlops(n));
1022: PetscCall(PetscLogGpuTimeEnd());
1023: } else { // need reassembly, very slow!
1024: PetscCall(MatShift_Basic(A, a));
1025: }
1026: PetscFunctionReturn(PETSC_SUCCESS);
1027: }
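/* The fast path above relies on diag_dual: Adiag(i) is the position of row i's diagonal entry
   within the value array, so Aa(Adiag(i)) += a touches exactly one entry per row. For example,
   with Ai = {0,2,4} and Aj = {0,1,0,1}, the diagonal positions are Adiag = {0,3}. */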
1029: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1030: {
1031: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1033: PetscFunctionBegin;
1034: if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1035: ConstPetscScalarKokkosView dv;
1036: PetscInt n, nv;
1038: PetscCall(PetscLogGpuTimeBegin());
1039: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1040: PetscCall(VecGetKokkosView(D, &dv));
1041: PetscCall(VecGetLocalSize(D, &nv));
1042: n = PetscMin(Y->rmap->n, Y->cmap->n);
1043: PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1045: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1046: const auto &Aa = aijkok->a_dual.view_device();
1047: const auto &Adiag = aijkok->diag_dual.view_device();
1048: PetscCallCXX(Kokkos::parallel_for(
1049: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1050: if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1051: else Aa(Adiag(i)) += dv(i);
1052: }));
1053: PetscCall(VecRestoreKokkosView(D, &dv));
1054: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1055: PetscCall(PetscLogGpuFlops(n));
1056: PetscCall(PetscLogGpuTimeEnd());
1057: } else { // need reassembly, very slow!
1058: PetscCall(MatDiagonalSet_Default(Y, D, is));
1059: }
1060: PetscFunctionReturn(PETSC_SUCCESS);
1061: }
1063: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1064: {
1065: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1066: PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1067: ConstPetscScalarKokkosView lv, rv;
1069: PetscFunctionBegin;
1070: PetscCall(PetscLogGpuTimeBegin());
1071: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1072: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1073: const auto &Aa = aijkok->a_dual.view_device();
1074: const auto &Ai = aijkok->i_dual.view_device();
1075: const auto &Aj = aijkok->j_dual.view_device();
1076: if (ll) {
1077: PetscCall(VecGetLocalSize(ll, &m));
1078: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1079: PetscCall(VecGetKokkosView(ll, &lv));
1080: PetscCallCXX(Kokkos::parallel_for( // for each row
1081: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1082: PetscInt i = t.league_rank(); // row i
1083: PetscInt len = Ai(i + 1) - Ai(i);
1084: // scale entries on the row
1085: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1086: }));
1087: PetscCall(VecRestoreKokkosView(ll, &lv));
1088: PetscCall(PetscLogGpuFlops(nz));
1089: }
1090: if (rr) {
1091: PetscCall(VecGetLocalSize(rr, &n));
1092: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1093: PetscCall(VecGetKokkosView(rr, &rv));
1094: PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1095: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1096: PetscCall(VecRestoreKokkosView(rr, &rv));
1097: PetscCall(PetscLogGpuFlops(nz));
1098: }
1099: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1100: PetscCall(PetscLogGpuTimeEnd());
1101: PetscFunctionReturn(PETSC_SUCCESS);
1102: }
1104: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1105: {
1106: Mat_SeqAIJKokkos *aijkok;
1108: PetscFunctionBegin;
1109: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1110: if (aijkok) { /* Only zero the device if data is already there */
1111: KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1112: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1113: } else { /* Might be preallocated but not assembled */
1114: PetscCall(MatZeroEntries_SeqAIJ(A));
1115: }
1116: PetscFunctionReturn(PETSC_SUCCESS);
1117: }
1119: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1120: {
1121: Mat_SeqAIJKokkos *aijkok;
1122: PetscInt n;
1123: PetscScalarKokkosView xv;
1125: PetscFunctionBegin;
1126: PetscCall(VecGetLocalSize(x, &n));
1127: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1128: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1130: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1131: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1133: const auto &Aa = aijkok->a_dual.view_device();
1134: const auto &Ai = aijkok->i_dual.view_device();
1135: const auto &Adiag = aijkok->diag_dual.view_device();
1137: PetscCall(VecGetKokkosViewWrite(x, &xv));
1138: Kokkos::parallel_for(
1139: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1140: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1141: else xv(i) = 0;
1142: });
1143: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1144: PetscFunctionReturn(PETSC_SUCCESS);
1145: }
1147: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1148: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1149: {
1150: Mat_SeqAIJKokkos *aijkok;
1152: PetscFunctionBegin;
1154: PetscAssertPointer(kv, 2);
1155: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1156: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1157: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1158: *kv = aijkok->a_dual.view_device();
1159: PetscFunctionReturn(PETSC_SUCCESS);
1160: }
1162: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1163: {
1164: PetscFunctionBegin;
1166: PetscAssertPointer(kv, 2);
1167: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1168: PetscFunctionReturn(PETSC_SUCCESS);
1169: }
1171: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1172: {
1173: Mat_SeqAIJKokkos *aijkok;
1175: PetscFunctionBegin;
1177: PetscAssertPointer(kv, 2);
1178: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1180: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1181: *kv = aijkok->a_dual.view_device();
1182: PetscFunctionReturn(PETSC_SUCCESS);
1183: }
1185: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1186: {
1187: PetscFunctionBegin;
1189: PetscAssertPointer(kv, 2);
1190: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1192: PetscFunctionReturn(PETSC_SUCCESS);
1193: }
1195: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1196: {
1197: Mat_SeqAIJKokkos *aijkok;
1199: PetscFunctionBegin;
1201: PetscAssertPointer(kv, 2);
1202: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1203: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1204: *kv = aijkok->a_dual.view_device();
1205: PetscFunctionReturn(PETSC_SUCCESS);
1206: }
1208: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1209: {
1210: PetscFunctionBegin;
1212: PetscAssertPointer(kv, 2);
1213: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1214: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1215: PetscFunctionReturn(PETSC_SUCCESS);
1216: }
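/* Usage sketch of the accessors above, assuming A is an assembled MATSEQAIJKOKKOS matrix:

     MatScalarKokkosView av;
     PetscCall(MatSeqAIJGetKokkosView(A, &av));     // device view, synced for read/write
     // ... update av in a Kokkos kernel ...
     PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); // marks the device data as modified
*/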
1218: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1219: {
1220: Mat_SeqAIJKokkos *akok;
1222: PetscFunctionBegin;
1223: auto exec = PetscGetKokkosExecutionSpace();
1224: // Create host copies of the input aij
1225: auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1226: auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1227: // Don't copy the vals to the host now
1228: auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1230: MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1231: // Note we have modified device data so it will copy lazily
1232: a_dual.modify_device();
1233: MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1234: MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1236: PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1237: PetscCall(MatCreate(comm, A));
1238: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1239: PetscFunctionReturn(PETSC_SUCCESS);
1240: }
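/* Usage sketch, assuming d_i, d_j, d_a are device views already holding a valid m-by-n CSR:

     Mat                         A;
     Kokkos::View<PetscInt *>    d_i("i", m + 1), d_j("j", nnz);
     Kokkos::View<PetscScalar *> d_a("a", nnz);
     // ... fill d_i, d_j, d_a on device ...
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, m, n, d_i, d_j, d_a, &A));
*/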
1242: /* Computes Y += alpha X */
1243: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1244: {
1245: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1246: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1247: ConstMatScalarKokkosView Xa;
1248: MatScalarKokkosView Ya;
1249: auto exec = PetscGetKokkosExecutionSpace();
1251: PetscFunctionBegin;
1252: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1253: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1254: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1255: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1256: PetscCall(PetscLogGpuTimeBegin());
1258: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1259: PetscBool e;
1260: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1261: if (e) {
1262: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1263: if (e) pattern = SAME_NONZERO_PATTERN;
1264: }
1265: }
1267: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1268: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1269: If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1270: KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
1271: */
1272: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1273: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1274: Xa = xkok->a_dual.view_device();
1275: Ya = ykok->a_dual.view_device();
1277: if (pattern == SAME_NONZERO_PATTERN) {
1278: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1279: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1280: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1281: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1282: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1284: Kokkos::parallel_for(
1285: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1286: PetscInt i = t.league_rank(); // row i
1287: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1288: // Only one thread works in a team
1289: PetscInt p, q = Yi(i);
1290: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1291: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1292: if (Xj(p) == Yj(q)) { // Found it
1293: Ya(q) += alpha * Xa(p);
1294: q++;
1295: } else {
1296: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1297: // Just insert a NaN at the beginning of row i, if the row is not empty, to make the result obviously wrong.
1298: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1299: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1300: #else
1301: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1302: #endif
1303: }
1304: }
1305: });
1306: });
1307: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1308: } else { // different nonzero patterns
1309: Mat Z;
1310: KokkosCsrMatrix zcsr;
1311: KernelHandle kh;
1312: kh.create_spadd_handle(true); // X, Y are sorted
1313: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1314: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1315: zkok = new Mat_SeqAIJKokkos(zcsr);
1316: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1317: PetscCall(MatHeaderReplace(Y, &Z));
1318: kh.destroy_spadd_handle();
1319: }
1320: PetscCall(PetscLogGpuTimeEnd());
1321: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1322: PetscFunctionReturn(PETSC_SUCCESS);
1323: }
1325: struct MatCOOStruct_SeqAIJKokkos {
1326: PetscCount n;
1327: PetscCount Atot;
1328: PetscInt nz;
1329: PetscCountKokkosView jmap;
1330: PetscCountKokkosView perm;
1332: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1333: {
1334: nz = coo_h->nz;
1335: n = coo_h->n;
1336: Atot = coo_h->Atot;
1337: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1338: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1339: }
1340: };
1342: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1343: {
1344: PetscFunctionBegin;
1345: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1346: PetscFunctionReturn(PETSC_SUCCESS);
1347: }
1349: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1350: {
1351: Mat_SeqAIJKokkos *akok;
1352: Mat_SeqAIJ *aseq;
1353: PetscContainer container_h;
1354: MatCOOStruct_SeqAIJ *coo_h;
1355: MatCOOStruct_SeqAIJKokkos *coo_d;
1357: PetscFunctionBegin;
1358: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1359: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1360: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1361: delete akok;
1362: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1363: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1365: // Copy the COO struct to device
1366: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1367: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1368: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1370: // Put the COO struct in a container and then attach that to the matrix
1371: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1372: PetscFunctionReturn(PETSC_SUCCESS);
1373: }
1375: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1376: {
1377: MatScalarKokkosView Aa;
1378: ConstMatScalarKokkosView kv;
1379: PetscMemType memtype;
1380: PetscContainer container;
1381: MatCOOStruct_SeqAIJKokkos *coo;
1383: PetscFunctionBegin;
1384: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1385: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1387: const auto &n = coo->n;
1388: const auto &Annz = coo->nz;
1389: const auto &jmap = coo->jmap;
1390: const auto &perm = coo->perm;
1392: PetscCall(PetscGetMemType(v, &memtype));
1393: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, copy it to device */
1394: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1395: } else {
1396: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1397: }
1399: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1400: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1402: PetscCall(PetscLogGpuTimeBegin());
1403: Kokkos::parallel_for(
1404: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1405: PetscScalar sum = 0.0;
1406: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1407: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1408: });
1409: PetscCall(PetscLogGpuTimeEnd());
1411: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1412: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1413: PetscFunctionReturn(PETSC_SUCCESS);
1414: }
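/* Usage sketch of the COO assembly interface implemented above (a hypothetical 2x2 example;
   A is assumed to be a MATSEQAIJKOKKOS matrix of matching size):

     PetscInt    coo_i[] = {0, 0, 1, 1, 1};           // row indices; repeated (i,j) pairs are allowed
     PetscInt    coo_j[] = {0, 1, 0, 1, 1};           // column indices
     PetscScalar v[]     = {1.0, 2.0, 3.0, 4.0, 5.0}; // may live on host or device, see PetscGetMemType() above
     PetscCall(MatSetPreallocationCOO(A, 5, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // repeated entries are summed: A(1,1) = 4 + 5
*/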
1416: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1417: {
1418: PetscFunctionBegin;
1419: PetscCall(MatSeqAIJKokkosSyncHost(A));
1420: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1421: B->offloadmask = PETSC_OFFLOAD_CPU;
1422: PetscFunctionReturn(PETSC_SUCCESS);
1423: }
1425: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1426: {
1427: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1429: PetscFunctionBegin;
1430: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1431: A->boundtocpu = PETSC_FALSE;
1433: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1434: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1435: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1436: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1437: A->ops->scale = MatScale_SeqAIJKokkos;
1438: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1439: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1440: A->ops->mult = MatMult_SeqAIJKokkos;
1441: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1442: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1443: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1444: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1445: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1446: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1447: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1448: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1449: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1450: A->ops->shift = MatShift_SeqAIJKokkos;
1451: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1452: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1453: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1454: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1455: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1456: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1457: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1458: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1459: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1461: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1462: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1463: #if defined(PETSC_HAVE_HYPRE)
1464: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1465: #endif
1466: PetscFunctionReturn(PETSC_SUCCESS);
1467: }
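/* Usage sketch: user code reaches the operations wired up above simply by choosing the matrix type,
   either programmatically or from the options database:

     PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); // or run with -mat_type aijkokkos
*/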
1469: /*
1470: Extract the (prescribed) diagonal blocks of the matrix and then invert them
1472: Input Parameters:
1473: + A - the MATSEQAIJKOKKOS matrix
1474: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1475: . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1476: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1477: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1479: Output Parameter:
1480: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1481: */
1482: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1483: {
1484: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1485: PetscInt nblocks = bs.extent(0) - 1;
1487: PetscFunctionBegin;
1488: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1490: // Pull out the diagonal blocks of the matrix and then invert the blocks
1491: auto Aa = akok->a_dual.view_device();
1492: auto Ai = akok->i_dual.view_device();
1493: auto Aj = akok->j_dual.view_device();
1494: auto Adiag = akok->diag_dual.view_device();
1495: // TODO: how to tune the team size?
1496: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1497: auto ts = Kokkos::AUTO();
1498: #else
1499: auto ts = 16; // improved performance by ~30% over Kokkos::AUTO() with CUDA, but fails with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1500: #endif
1501: PetscCallCXX(Kokkos::parallel_for(
1502: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1503: const PetscInt bid = teamMember.league_rank(); // block id
1504: const PetscInt rstart = bs(bid); // this block starts from this row
1505: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1506: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1507: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1509: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1510: PetscInt i = rstart + r; // i-th row in A
1512: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1513: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1515: for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1516: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1517: B(r, c) = 0.0;
1518: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1519: B(r, c) = Aa(first + c);
1520: } else { // this entry does not show up in the CSR
1521: B(r, c) = 0.0;
1522: }
1523: }
1524: } else { // rare case that the diagonal does not exist
1525: const PetscInt begin = Ai(i);
1526: const PetscInt end = Ai(i + 1);
1527: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1528: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1529: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1530: else if (Aj(j) >= rstart + m) break;
1531: }
1532: }
1533: });
1535: // LU-decompose B (w/o pivoting) and then invert B
1536: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1537: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1538: }));
1539: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1540: PetscFunctionReturn(PETSC_SUCCESS);
1541: }
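/* Usage sketch: this routine backs the variable-point-block Jacobi preconditioner (see its caller
   PCSetUp_VPBJacobi_Kokkos). A hypothetical setup, assuming block sizes 2 and 3 cover all rows of A:

     PetscInt bsizes[2] = {2, 3};
     PetscCall(MatSetVariableBlockSizes(A, 2, bsizes));
     PetscCall(PCSetType(pc, PCVPBJACOBI)); // or -pc_type vpbjacobi on the command line
*/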
1543: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1544: {
1545: Mat_SeqAIJ *aseq;
1546: PetscInt i, m, n;
1547: auto exec = PetscGetKokkosExecutionSpace();
1549: PetscFunctionBegin;
1550: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1552: m = akok->nrows();
1553: n = akok->ncols();
1554: PetscCall(MatSetSizes(A, m, n, m, n));
1555: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1557: /* Set up data structures of A as a MATSEQAIJ */
1558: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1559: aseq = (Mat_SeqAIJ *)A->data;
1561: PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1562: PetscCallCXX(akok->j_dual.sync_host(exec));
1563: PetscCallCXX(exec.fence());
1565: aseq->i = akok->i_host_data();
1566: aseq->j = akok->j_host_data();
1567: aseq->a = akok->a_host_data();
1568: aseq->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1569: aseq->free_a = PETSC_FALSE;
1570: aseq->free_ij = PETSC_FALSE;
1571: aseq->nz = akok->nnz();
1572: aseq->maxnz = aseq->nz;
1574: PetscCall(PetscMalloc1(m, &aseq->imax));
1575: PetscCall(PetscMalloc1(m, &aseq->ilen));
1576: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1578: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1579: akok->nonzerostate = A->nonzerostate;
1580: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1581: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1582: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1583: PetscFunctionReturn(PETSC_SUCCESS);
1584: }
1586: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1587: {
1588: PetscFunctionBegin;
1589: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1590: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1591: PetscFunctionReturn(PETSC_SUCCESS);
1592: }
1594: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1595: {
1596: Mat_SeqAIJKokkos *akok;
1598: PetscFunctionBegin;
1599: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1600: PetscCall(MatCreate(comm, A));
1601: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1602: PetscFunctionReturn(PETSC_SUCCESS);
1603: }
1605: /* Create a MATSEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1607: Note we have names like MatSeqAIJSetPreallocationCSR(), hence the capitalized CSR here
1608: */
1609: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1610: {
1611: PetscFunctionBegin;
1612: PetscCall(MatCreate(comm, A));
1613: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1614: PetscFunctionReturn(PETSC_SUCCESS);
1615: }
1617: /*@C
1618: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1619: (the Kokkos version of the default sequential PETSc format). This matrix will ultimately be handled by
1620: Kokkos for calculations.
1622: Collective
1624: Input Parameters:
1625: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1626: . m - number of rows
1627: . n - number of columns
1628: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1629: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1631: Output Parameter:
1632: . A - the matrix
1634: Level: intermediate
1636: Notes:
1637: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1638: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1639: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1641: The AIJ format, also called
1642: compressed row storage, is fully compatible with standard Fortran
1643: storage. That is, the stored row and column indices can begin at
1644: either one (as in Fortran) or zero.
1646: Specify the preallocated storage with either `nz` or `nnz` (not both).
1647: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1648: allocation.
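Example usage, creating an empty 100 x 100 matrix preallocated with roughly 5 nonzeros per row:
.vb
Mat A;
PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 100, 100, 5, NULL, &A));
// ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
PetscCall(MatDestroy(&A));
.ve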
1650: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1651: @*/
1652: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1653: {
1654: PetscFunctionBegin;
1655: PetscCall(PetscKokkosInitializeCheck());
1656: PetscCall(MatCreate(comm, A));
1657: PetscCall(MatSetSizes(*A, m, n, m, n));
1658: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1659: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1660: PetscFunctionReturn(PETSC_SUCCESS);
1661: }
1663: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1664: {
1665: PetscFunctionBegin;
1666: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1667: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1668: PetscFunctionReturn(PETSC_SUCCESS);
1669: }
1671: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1672: {
1673: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675: PetscFunctionBegin;
1676: if (!factors->sptrsv_symbolic_completed) {
1677: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1678: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1679: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1680: }
1681: PetscFunctionReturn(PETSC_SUCCESS);
1682: }
1684: /* Check if we need to update factors etc for transpose solve */
1685: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1686: {
1687: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1688: MatColIdxType n = A->rmap->n;
1690: PetscFunctionBegin;
1691: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1692: /* Update L^T and do sptrsv symbolic */
1693: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires the row map to be zero-initialized (the labeled View constructor zeroes it)
1694: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1695: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1697: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1698: factors->iLt_d, factors->jLt_d, factors->aLt_d);
1700: /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1701: We have to sort the indices, until KK provides finer control options.
1702: */
1703: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1705: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1707: /* Update U^T and do sptrsv symbolic */
1708: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires the row map to be zero-initialized (the labeled View constructor zeroes it)
1709: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1710: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1712: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1713: factors->iUt_d, factors->jUt_d, factors->aUt_d);
1715: /* Sort indices. See comments above */
1716: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1718: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1719: factors->transpose_updated = PETSC_TRUE;
1720: }
1721: PetscFunctionReturn(PETSC_SUCCESS);
1722: }
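/* Usage sketch: the transposed factors above are built lazily, the first time a transpose solve is
   requested, e.g. through

     PetscCall(KSPSolveTranspose(ksp, b, x)); // ends up in MatSolveTranspose_SeqAIJKokkos() below
*/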
1724: /* Solve Ax = b, with A = LU */
1725: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1726: {
1727: ConstPetscScalarKokkosView bv;
1728: PetscScalarKokkosView xv;
1729: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1731: PetscFunctionBegin;
1732: PetscCall(PetscLogGpuTimeBegin());
1733: PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1734: PetscCall(VecGetKokkosView(b, &bv));
1735: PetscCall(VecGetKokkosViewWrite(x, &xv));
1736: /* Solve L tmpv = b */
1737: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1738: /* Solve Ux = tmpv */
1739: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1740: PetscCall(VecRestoreKokkosView(b, &bv));
1741: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1742: PetscCall(PetscLogGpuTimeEnd());
1743: PetscFunctionReturn(PETSC_SUCCESS);
1744: }
1746: /* Solve A^T x = b, where A^T = U^T L^T */
1747: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1748: {
1749: ConstPetscScalarKokkosView bv;
1750: PetscScalarKokkosView xv;
1751: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1753: PetscFunctionBegin;
1754: PetscCall(PetscLogGpuTimeBegin());
1755: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1756: PetscCall(VecGetKokkosView(b, &bv));
1757: PetscCall(VecGetKokkosViewWrite(x, &xv));
1758: /* Solve U^T tmpv = b */
1759: KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1761: /* Solve L^T x = tmpv */
1762: KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1763: PetscCall(VecRestoreKokkosView(b, &bv));
1764: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1765: PetscCall(PetscLogGpuTimeEnd());
1766: PetscFunctionReturn(PETSC_SUCCESS);
1767: }
1769: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1770: {
1771: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1772: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1773: PetscInt fill_lev = info->levels;
1775: PetscFunctionBegin;
1776: PetscCall(PetscLogGpuTimeBegin());
1777: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1779: auto a_d = aijkok->a_dual.view_device();
1780: auto i_d = aijkok->i_dual.view_device();
1781: auto j_d = aijkok->j_dual.view_device();
1783: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1785: B->assembled = PETSC_TRUE;
1786: B->preallocated = PETSC_TRUE;
1787: B->ops->solve = MatSolve_SeqAIJKokkos;
1788: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1789: B->ops->matsolve = NULL;
1790: B->ops->matsolvetranspose = NULL;
1791: B->offloadmask = PETSC_OFFLOAD_GPU;
1793: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1794: factors->transpose_updated = PETSC_FALSE;
1795: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1796: /* TODO: log flops, but how to know that? */
1797: PetscCall(PetscLogGpuTimeEnd());
1798: PetscFunctionReturn(PETSC_SUCCESS);
1799: }
1801: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1802: {
1803: Mat_SeqAIJKokkos *aijkok;
1804: Mat_SeqAIJ *b;
1805: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1806: PetscInt fill_lev = info->levels;
1807: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1808: PetscInt n = A->rmap->n;
1810: PetscFunctionBegin;
1811: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1812: /* Rebuild factors */
1813: if (factors) {
1814: factors->Destroy();
1815: } /* Destroy the old if it exists */
1816: else {
1817: B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1818: }
1820: /* Create a spiluk handle and then do symbolic factorization */
1821: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1822: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1824: auto spiluk_handle = factors->kh.get_spiluk_handle();
1826: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1827: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1828: Kokkos::realloc(factors->iU_d, n + 1);
1829: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1831: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1832: auto i_d = aijkok->i_dual.view_device();
1833: auto j_d = aijkok->j_dual.view_device();
1834: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1835: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1837: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1838: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1839: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1840: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1842: /* TODO: add options to select sptrsv algorithms */
1843: /* Create sptrsv handles for L, U and their transpose */
1844: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1845: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1846: #else
1847: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1848: #endif
1850: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1851: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1852: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1853: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1855: /* Fill fields of the factor matrix B */
1856: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1857: b = (Mat_SeqAIJ *)B->data;
1858: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1859: B->info.fill_ratio_given = info->fill;
1860: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1862: B->offloadmask = PETSC_OFFLOAD_GPU;
1863: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1864: PetscFunctionReturn(PETSC_SUCCESS);
1865: }
1867: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1868: {
1869: PetscFunctionBegin;
1870: *type = MATSOLVERKOKKOS;
1871: PetscFunctionReturn(PETSC_SUCCESS);
1872: }
1874: /*MC
1875: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1876: of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.
1878: Level: beginner
1880: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1881: M*/
1882: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1883: {
1884: PetscInt n = A->rmap->n;
1886: PetscFunctionBegin;
1887: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1888: PetscCall(MatSetSizes(*B, n, n, n, n));
1889: (*B)->factortype = ftype;
1890: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1891: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1893: if (ftype == MAT_FACTOR_LU) {
1894: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1895: (*B)->canuseordering = PETSC_TRUE;
1896: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1897: } else if (ftype == MAT_FACTOR_ILU) {
1898: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1899: (*B)->canuseordering = PETSC_FALSE;
1900: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1901: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1903: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1904: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1905: PetscFunctionReturn(PETSC_SUCCESS);
1906: }
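/* Usage sketch: these factorizations are normally reached through a PC, e.g. (assuming ksp is a
   KSP whose operator is a MATSEQAIJKOKKOS matrix):

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));

   or equivalently on the command line: -pc_type ilu -pc_factor_mat_solver_type kokkos
*/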
1908: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1909: {
1910: PetscFunctionBegin;
1911: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1912: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1913: PetscFunctionReturn(PETSC_SUCCESS);
1914: }
1916: /* Utility to print out a KokkosCsrMatrix for debugging */
1917: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1918: {
1919: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
1920: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
1921: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1922: const PetscInt *i = iv.data();
1923: const PetscInt *j = jv.data();
1924: const PetscScalar *a = av.data();
1925: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1927: PetscFunctionBegin;
1928: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1929: for (PetscInt k = 0; k < m; k++) {
1930: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1931: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1932: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1933: }
1934: PetscFunctionReturn(PETSC_SUCCESS);
1935: }