Actual source code: aijkok.kokkos.cxx
#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petscsystypes.h>
#include <petscerror.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>
#include <KokkosSparse_spmv.hpp>
#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
  #include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
  #include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

/* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix are finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet, or the nonzero pattern has changed */
    delete aijkok;
    aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
    A->spptr = aijkok;
  }

  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sync CSR data to device if not yet */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    aijkok->a_dual.sync_device();
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

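/* A minimal sketch (not part of the build) of the Kokkos::DualView protocol the two helpers
   above rely on; the method names are the same Kokkos API used throughout this file:

     Kokkos::DualView<PetscScalar *> a("a", n);
     a.modify_host();                            // declare the host copy as most recent
     if (a.need_sync_device()) a.sync_device();  // deep-copy host -> device only if needed
     a.clear_sync_state();                       // declare both copies coherent again

   MatSeqAIJKokkosSyncDevice()/MatSeqAIJKokkosModifyDevice() are the matrix-level wrappers of
   exactly these calls for the value array a_dual.
*/
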
static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto             &exec   = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect one needs factors on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCallCXX(aijkok->a_dual.sync_host(exec));
  PetscCallCXX(exec.fence());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    auto &exec = PetscGetKokkosExecutionSpace();
    PetscCallCXX(aijkok->a_dual.sync_host(exec));
    PetscCallCXX(exec.fence());
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    auto &exec = PetscGetKokkosExecutionSpace();
    PetscCallCXX(aijkok->a_dual.sync_host(exec));
    PetscCallCXX(exec.fence());
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* e.g., happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    aijkok->a_dual.sync_device();
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

   Input Parameter:
.  A - the MATSEQAIJKOKKOS matrix

   Output Parameters:
+  perm_d - the permutation array on device, which gives Ta(i) = Aa(perm(i))
-  T_d - the transpose on device, whose value array is allocated but not initialized
*/
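/* Illustrative example (hypothetical data): for the 2x3 matrix
     A = [1 0 2]
         [0 3 0]
   stored as Ai = {0,2,3}, Aj = {0,2,1}, Aa = {1,2,3}, the routine below produces the 3x2
   transpose with Ti = {0,1,2,3}, Tj = {0,1,0} and perm = {0,2,1}, so that a later kernel can
   fill the values via Ta(i) = Aa(perm(i)) = {1,3,2} without recomputing the sparsity pattern.
*/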
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j]; // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the transpose on device and cache it internally
// Note: KK's transpose_matrix() does not support separate symbolic/numeric phases, so we do it on our own
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {           // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(
          nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(
        nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT                = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian transpose on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {           // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(
          nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(
        nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH                = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows would be more accurate, but it assumes there are no zero rows. So we are a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv, yv;
  PetscScalarKokkosView      zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(yy, &yv));
  PetscCall(VecGetKokkosViewWrite(zz, &zv));
  if (zz != yy) Kokkos::deep_copy(zv, yv);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(yy, &yv));
  PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv, yv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(yy, &yv));
  PetscCall(VecGetKokkosViewWrite(zz, &zv));
  if (zz != yy) Kokkos::deep_copy(zv, yv);
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(yy, &yv));
  PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv, yv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(yy, &yv));
  PetscCall(VecGetKokkosViewWrite(zz, &zv));
  if (zz != yy) Kokkos::deep_copy(zv, yv);
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(yy, &yv));
  PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
*/
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->preallocated) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At in place */
  } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
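/* Illustrative example (hypothetical data): for
     A = [1 2]  (2x2)   and   B = [5]  (2x1),
         [3 4]                    [6]
   C is 2x3 and entries of B get column indices shifted by A->cmap->n = 2:
     C = [1 2 5]
         [3 4 6]
*/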
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ         *a, *b;
  Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt            m, n, nnz, aN;

  PetscFunctionBegin;

  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a    = static_cast<Mat_SeqAIJ *>(A->data);
  b    = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa   = akok->a_dual.view_device();
  ai   = akok->i_dual.view_device();
  ba   = bkok->a_dual.view_device();
  bi   = bkok->i_dual.view_device();
  m    = A->rmap->n; /* M, N and nnz of C */
  n    = A->cmap->n + B->cmap->n;
  nnz  = a->nz + b->nz;
  aN   = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj           = akok->j_dual.view_device();
    bj           = bkok->j_dual.view_device();
    auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
    auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
    auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
    ca           = ca_dual.view_device();
    ci           = ci_dual.view_device();
    cj           = cj_dual.view_device();

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i       = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
          ci(i) = coffset;
          if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    ca_dual.modify_device();
    ci_dual.modify_device();
    cj_dual.modify_device();
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {

    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca   = ckok->a_dual.view_device();
    ci   = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i    = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
{
  PetscFunctionBegin;
  delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  bool                         transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  Mat_SeqAIJ                  *c;
  MatProductData_SeqAIJKokkos *pdata;
  KokkosCsrMatrix              csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

  // See if the numeric phase has already been done in symbolic (e.g., the user called MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If so, skip the numeric phase, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do the numeric phase.
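  // Illustrative (hypothetical) user call sequence this flag handles:
  //   MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C); // symbolic phase above already did the numeric
  //   MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C);   // numeric must actually be redone here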
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  MatProductType               ptype;
  Mat                          A, B;
  bool                         transA, transB;
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  MatProductData_SeqAIJKokkos *pdata;
  MPI_Comm                     comm;
  KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query whether KK outputs a sorted matrix. If not, we need to sort it ourselves */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* handles sparse matrix matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJ           *aijseq;
  Mat_SeqAIJKokkos     *aijkok;
  PetscInt              n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    aijseq = static_cast<Mat_SeqAIJ *>(A->data);
    aijkok->SetDiagonal(aijseq->diag);
  }

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    n, KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Computes Y += alpha X */
static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
{
  Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
  ConstMatScalarKokkosView Xa;
  MatScalarKokkosView      Ya;

  PetscFunctionBegin;
  PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
  PetscCheckTypeName(X, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(Y));
  PetscCall(MatSeqAIJKokkosSyncDevice(X));
  PetscCall(PetscLogGpuTimeBegin());

  if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
    PetscBool e;
    PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
    if (e) {
      PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
      if (e) pattern = SAME_NONZERO_PATTERN;
    }
  }

  /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseScsrgeam2().
     If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, while MatAXPY does not.
  */
  ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
  xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
  Xa   = xkok->a_dual.view_device();
  Ya   = ykok->a_dual.view_device();

  if (pattern == SAME_NONZERO_PATTERN) {
    KokkosBlas::axpy(alpha, Xa, Ya);
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else if (pattern == SUBSET_NONZERO_PATTERN) {
    MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
    MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); // row i
        Kokkos::single(Kokkos::PerTeam(t), [=]() {
          // Only one thread works in a team
          PetscInt p, q = Yi(i);
          for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
            while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
            if (Xj(p) == Yj(q)) {                        // Found it
              Ya(q) += alpha * Xa(p);
              q++;
            } else {
              // If not found, the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
              // Just insert a NaN at the beginning of row i if it is not empty, to make the result clearly wrong.
#if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
#else
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
#endif
            }
          }
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else { // different nonzero patterns
    Mat             Z;
    KokkosCsrMatrix zcsr;
    KernelHandle    kh;
    kh.create_spadd_handle(true); // X, Y are sorted
    KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
    KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
    zkok = new Mat_SeqAIJKokkos(zcsr);
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
    PetscCall(MatHeaderReplace(Y, &Z));
    kh.destroy_spadd_handle();
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
  PetscFunctionReturn(PETSC_SUCCESS);
}

struct MatCOOStruct_SeqAIJKokkos {
  PetscCount           n;
  PetscCount           Atot;
  PetscInt             nz;
  PetscCountKokkosView jmap;
  PetscCountKokkosView perm;

  MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
  {
    nz   = coo_h->nz;
    n    = coo_h->n;
    Atot = coo_h->Atot;
    jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
    perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
  }
};

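/* Illustrative example (hypothetical data) of the mapping stored above: for the repeated input
   triplets (0,0,v0), (0,0,v1), (1,1,v2) on a 2x2 matrix, there are nz = 2 unique nonzeros and
   Atot = 3 inputs, with jmap = {0,2,3} and perm = {0,1,2}. MatSetValuesCOO_SeqAIJKokkos() below
   then computes Aa(0) = v(perm(0)) + v(perm(1)) = v0 + v1 and Aa(1) = v(perm(2)) = v2.
*/
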
static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
{
  PetscFunctionBegin;
  PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJKokkos          *akok;
  Mat_SeqAIJ                *aseq;
  PetscContainer             container_h, container_d;
  MatCOOStruct_SeqAIJ       *coo_h;
  MatCOOStruct_SeqAIJKokkos *coo_d;

  PetscFunctionBegin;
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
  aseq = static_cast<Mat_SeqAIJ *>(mat->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
  delete akok;
  mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
  PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
{
  MatScalarKokkosView        Aa;
  ConstMatScalarKokkosView   kv;
  PetscMemType               memtype;
  PetscContainer             container;
  MatCOOStruct_SeqAIJKokkos *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &n    = coo->n;
  const auto &Annz = coo->nz;
  const auto &jmap = coo->jmap;
  const auto &perm = coo->perm;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, we need to copy it to device */
    kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
  } else {
    kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
  else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

  PetscCall(PetscLogGpuTimeBegin());
  Kokkos::parallel_for(
    Annz, KOKKOS_LAMBDA(const PetscCount i) {
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
      Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
    });
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
  else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncHost(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
  A->boundtocpu  = PETSC_FALSE;

  A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
  A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
  A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
  A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
  A->ops->scale                     = MatScale_SeqAIJKokkos;
  A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
  A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
  A->ops->mult                      = MatMult_SeqAIJKokkos;
  A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
  A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
  A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
  A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
  A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
  A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
  A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
  A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
  A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
  a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
  a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
  a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
  a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
  a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
  a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
  a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Extract the (prescribed) diagonal blocks of the matrix and then invert them

   Input Parameters:
+  A - the MATSEQAIJKOKKOS matrix
.  bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
.  bs2 - squares of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
.  blkMap - map from row ids to block ids, i.e., row i belongs to block blkMap(i)
-  work - a pre-allocated work buffer (as big as diagVal) for use by this routine

   Output Parameter:
.  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
*/
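/* Illustrative example (hypothetical data): for a 5x5 matrix with two diagonal blocks of sizes
   2 and 3,
     bs     = {0, 2, 5}        (block i spans rows bs(i)..bs(i+1)-1)
     bs2    = {0, 4, 13}       (prefix sums of the squares 2*2 and 3*3)
     blkMap = {0, 0, 1, 1, 1}  (rows 0-1 belong to block 0, rows 2-4 to block 1)
   and diagVal/work must each hold bs2(nblocks) = 13 scalars.
*/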
1352: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1353: {
1354: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1355: PetscInt N = A->rmap->n;
1356: PetscInt nblocks = bs.extent(0) - 1;
1358: PetscFunctionBegin;
1359: // Set the diagonal pointer on device if not already
1360: if (N && akok->diag_dual.extent(0) == 0) {
1361: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1362: akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag);
1363: }
1365: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1367: // Pull out the diagonal blocks of the matrix and then invert the blocks
1368: auto Aa = akok->a_dual.view_device();
1369: auto Ai = akok->i_dual.view_device();
1370: auto Aj = akok->j_dual.view_device();
1371: auto Adiag = akok->diag_dual.view_device();
1372: // TODO: how to tune the team size?
1373: #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1374: auto ts = Kokkos::AUTO();
1375: #else
1376: auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1377: #endif
1378: PetscCallCXX(Kokkos::parallel_for(
1379: Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1380: const PetscInt bid = teamMember.league_rank(); // block id
1381: const PetscInt rstart = bs(bid); // this block starts from this row
1382: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1383: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1384: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1386: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1387: PetscInt i = rstart + r; // i-th row in A
1389: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1390: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1392:         for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1393: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1394: B(r, c) = 0.0;
1395: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1396: B(r, c) = Aa(first + c);
1397: } else { // this entry does not show up in the CSR
1398: B(r, c) = 0.0;
1399: }
1400: }
1401: } else { // rare case that the diagonal does not exist
1402: const PetscInt begin = Ai(i);
1403: const PetscInt end = Ai(i + 1);
1404: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1405:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; binary search would work, but this case is rare so we keep it simple
1406: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1407: else if (Aj(j) >= rstart + m) break;
1408: }
1409: }
1410: });
1412: // LU-decompose B (w/o pivoting) and then invert B
1413: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1414: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1415: }));
1416: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1417: PetscFunctionReturn(PETSC_SUCCESS);
1418: }
1420: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1421: {
1422: Mat_SeqAIJ *aseq;
1423: PetscInt i, m, n;
1424: auto &exec = PetscGetKokkosExecutionSpace();
1426: PetscFunctionBegin;
1427: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1429: m = akok->nrows();
1430: n = akok->ncols();
1431: PetscCall(MatSetSizes(A, m, n, m, n));
1432: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1434: /* Set up data structures of A as a MATSEQAIJ */
1435: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1436: aseq = (Mat_SeqAIJ *)(A)->data;
1438: PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1439: PetscCallCXX(akok->j_dual.sync_host(exec));
1440: PetscCallCXX(exec.fence());
1442: aseq->i = akok->i_host_data();
1443: aseq->j = akok->j_host_data();
1444: aseq->a = akok->a_host_data();
1445:   aseq->nonew = -1; /* inserting a value that would create a new nonzero is an error */
1446: aseq->singlemalloc = PETSC_FALSE;
1447: aseq->free_a = PETSC_FALSE;
1448: aseq->free_ij = PETSC_FALSE;
1449: aseq->nz = akok->nnz();
1450: aseq->maxnz = aseq->nz;
1452: PetscCall(PetscMalloc1(m, &aseq->imax));
1453: PetscCall(PetscMalloc1(m, &aseq->ilen));
1454: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
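1455:   /* e.g., with m = 2 rows and aseq->i = {0, 2, 5}, both ilen and imax become {2, 3} */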
1456: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1457: akok->nonzerostate = A->nonzerostate;
1458: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1459: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1460: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1461: PetscFunctionReturn(PETSC_SUCCESS);
1462: }
1464: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1465: {
1466: PetscFunctionBegin;
1467: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1468: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1469: PetscFunctionReturn(PETSC_SUCCESS);
1470: }
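/* Usage sketch (hypothetical caller; x_d and y_d are assumed to be device views of conforming
   extents): pull out the device CSR and run a Kokkos Kernels SpMV directly on it.

     KokkosCsrMatrix csr;
     PetscCall(MatSeqAIJKokkosGetKokkosCsrMatrix(A, &csr));
     KokkosSparse::spmv("N", 1.0, csr, x_d, 0.0, y_d); // y_d = A * x_d
*/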
1472: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1473: {
1474: Mat_SeqAIJKokkos *akok;
1475: PetscFunctionBegin;
1476: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1477: PetscCall(MatCreate(comm, A));
1478: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1479: PetscFunctionReturn(PETSC_SUCCESS);
1480: }
1482: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1484:    Note: PETSc has names like MatSeqAIJSetPreallocationCSR(), hence the capitalized CSR here
1485: */
1486: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1487: {
1488: PetscFunctionBegin;
1489: PetscCall(MatCreate(comm, A));
1490: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1491: PetscFunctionReturn(PETSC_SUCCESS);
1492: }
1494: /*@C
1495: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1496:   (the default sequential PETSc format). This matrix will ultimately be handled by
1497: Kokkos for calculations.
1499: Collective
1501: Input Parameters:
1502: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1503: . m - number of rows
1504: . n - number of columns
1505: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1506: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1508: Output Parameter:
1509: . A - the matrix
1511: Level: intermediate
1513: Notes:
1514: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1515:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1516: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1518: The AIJ format, also called
1519: compressed row storage, is fully compatible with standard Fortran
1520: storage. That is, the stored row and column indices can begin at
1521: either one (as in Fortran) or zero.
1523: Specify the preallocated storage with either `nz` or `nnz` (not both).
1524: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1525: allocation.
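   Example (a minimal sketch; value insertion and error handling elided):
.vb
   Mat A;
   PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 100, 100, 5, NULL, &A)); /* at most 5 nonzeros per row */
   /* ... insert entries with MatSetValues() ... */
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatDestroy(&A));
.ve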
1527: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1528: @*/
1529: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1530: {
1531: PetscFunctionBegin;
1532: PetscCall(PetscKokkosInitializeCheck());
1533: PetscCall(MatCreate(comm, A));
1534: PetscCall(MatSetSizes(*A, m, n, m, n));
1535: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1536: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1537: PetscFunctionReturn(PETSC_SUCCESS);
1538: }
1540: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1541: {
1542: PetscFunctionBegin;
1543: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1544: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1545: PetscFunctionReturn(PETSC_SUCCESS);
1546: }
1548: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1549: {
1550: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1552: PetscFunctionBegin;
1553: if (!factors->sptrsv_symbolic_completed) {
1554: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1555: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1556: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1557: }
1558: PetscFunctionReturn(PETSC_SUCCESS);
1559: }
1561: /* Check if we need to update factors etc for transpose solve */
1562: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1563: {
1564: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1565: MatColIdxType n = A->rmap->n;
1567: PetscFunctionBegin;
1568: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1569: /* Update L^T and do sptrsv symbolic */
1570:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK transpose_matrix() requires the output row map to be zero-initialized
1571: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1572: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1574: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1575: factors->iLt_d, factors->jLt_d, factors->aLt_d);
1577: /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1578: We have to sort the indices, until KK provides finer control options.
1579: */
1580: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1582: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1584: /* Update U^T and do sptrsv symbolic */
1585:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK transpose_matrix() requires the output row map to be zero-initialized
1586: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1587: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1589: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1590: factors->iUt_d, factors->jUt_d, factors->aUt_d);
1592: /* Sort indices. See comments above */
1593: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1595: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1596: factors->transpose_updated = PETSC_TRUE;
1597: }
1598: PetscFunctionReturn(PETSC_SUCCESS);
1599: }
1601: /* Solve Ax = b, with A = LU */
1602: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1603: {
1604: ConstPetscScalarKokkosView bv;
1605: PetscScalarKokkosView xv;
1606: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1608: PetscFunctionBegin;
1609: PetscCall(PetscLogGpuTimeBegin());
1610: PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1611: PetscCall(VecGetKokkosView(b, &bv));
1612: PetscCall(VecGetKokkosViewWrite(x, &xv));
1613: /* Solve L tmpv = b */
1614: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1615: /* Solve Ux = tmpv */
1616: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1617: PetscCall(VecRestoreKokkosView(b, &bv));
1618: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1619: PetscCall(PetscLogGpuTimeEnd());
1620: PetscFunctionReturn(PETSC_SUCCESS);
1621: }
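/* Typical call chain reaching this routine (a sketch; isrow, iscol and info are assumed to be
   set up by the caller):

     PetscCall(MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &B));
     PetscCall(MatLUFactorSymbolic(B, A, isrow, iscol, &info));
     PetscCall(MatLUFactorNumeric(B, A, &info));
     PetscCall(MatSolve(B, b, x)); // runs the two device triangular solves above
*/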
1623: /* Solve A^T x = b, where A^T = U^T L^T */
1624: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1625: {
1626: ConstPetscScalarKokkosView bv;
1627: PetscScalarKokkosView xv;
1628: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1630: PetscFunctionBegin;
1631: PetscCall(PetscLogGpuTimeBegin());
1632: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1633: PetscCall(VecGetKokkosView(b, &bv));
1634: PetscCall(VecGetKokkosViewWrite(x, &xv));
1635: /* Solve U^T tmpv = b */
1636:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector));
1638:   /* Solve L^T x = tmpv */
1639:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv));
1640: PetscCall(VecRestoreKokkosView(b, &bv));
1641: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1642: PetscCall(PetscLogGpuTimeEnd());
1643: PetscFunctionReturn(PETSC_SUCCESS);
1644: }
1646: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1647: {
1648: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1649: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1650: PetscInt fill_lev = info->levels;
1652: PetscFunctionBegin;
1653: PetscCall(PetscLogGpuTimeBegin());
1654: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1656: auto a_d = aijkok->a_dual.view_device();
1657: auto i_d = aijkok->i_dual.view_device();
1658: auto j_d = aijkok->j_dual.view_device();
1660: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1662: B->assembled = PETSC_TRUE;
1663: B->preallocated = PETSC_TRUE;
1664: B->ops->solve = MatSolve_SeqAIJKokkos;
1665: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1666: B->ops->matsolve = NULL;
1667: B->ops->matsolvetranspose = NULL;
1668: B->offloadmask = PETSC_OFFLOAD_GPU;
1670: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1671: factors->transpose_updated = PETSC_FALSE;
1672: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1673: /* TODO: log flops, but how to know that? */
1674: PetscCall(PetscLogGpuTimeEnd());
1675: PetscFunctionReturn(PETSC_SUCCESS);
1676: }
1678: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1679: {
1680: Mat_SeqAIJKokkos *aijkok;
1681: Mat_SeqAIJ *b;
1682: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1683: PetscInt fill_lev = info->levels;
1684: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1685: PetscInt n = A->rmap->n;
1687: PetscFunctionBegin;
1688: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1689:   /* Rebuild factors: destroy the old ones if they exist, otherwise allocate new ones */
1690:   if (factors) {
1691:     factors->Destroy();
1692:   } else {
1693:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1694:   }
1697: /* Create a spiluk handle and then do symbolic factorization */
1698: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1699: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1701: auto spiluk_handle = factors->kh.get_spiluk_handle();
1703: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1704: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1705: Kokkos::realloc(factors->iU_d, n + 1);
1706: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1708: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1709: auto i_d = aijkok->i_dual.view_device();
1710: auto j_d = aijkok->j_dual.view_device();
1711: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1712: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1714: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1715: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1716: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1717: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1719: /* TODO: add options to select sptrsv algorithms */
1720: /* Create sptrsv handles for L, U and their transpose */
1721: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1722: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1723: #else
1724: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1725: #endif
1727: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1728: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1729: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1730: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1732: /* Fill fields of the factor matrix B */
1733: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1734: b = (Mat_SeqAIJ *)B->data;
1735: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1736: B->info.fill_ratio_given = info->fill;
1737: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1739: B->offloadmask = PETSC_OFFLOAD_GPU;
1740: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1741: PetscFunctionReturn(PETSC_SUCCESS);
1742: }
1744: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1745: {
1746: PetscFunctionBegin;
1747: *type = MATSOLVERKOKKOS;
1748: PetscFunctionReturn(PETSC_SUCCESS);
1749: }
1751: /*MC
1752: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1753:   of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.
1755: Level: beginner
1757: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
1758: M*/
1759: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1760: {
1761: PetscInt n = A->rmap->n;
1763: PetscFunctionBegin;
1764: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1765: PetscCall(MatSetSizes(*B, n, n, n, n));
1766: (*B)->factortype = ftype;
1767: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1768: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1770: if (ftype == MAT_FACTOR_LU) {
1771: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1772: (*B)->canuseordering = PETSC_TRUE;
1773: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1774: } else if (ftype == MAT_FACTOR_ILU) {
1775: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1776: (*B)->canuseordering = PETSC_FALSE;
1777: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1778: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1780: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1781: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1782: PetscFunctionReturn(PETSC_SUCCESS);
1783: }
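/* Runtime selection sketch (standard PETSc command-line options; illustrative):
     -mat_type aijkokkos -pc_type lu  -pc_factor_mat_solver_type kokkos
     -mat_type aijkokkos -pc_type ilu -pc_factor_levels 1 -pc_factor_mat_solver_type kokkos
*/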
1785: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1786: {
1787: PetscFunctionBegin;
1788: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1789: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1790: PetscFunctionReturn(PETSC_SUCCESS);
1791: }
1793: /* Utility to print out a KokkosCsrMatrix for debugging */
1794: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1795: {
1796: const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1797: const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1798: const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1799: const PetscInt *i = iv.data();
1800: const PetscInt *j = jv.data();
1801: const PetscScalar *a = av.data();
1802: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1804: PetscFunctionBegin;
1805:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " KokkosCsrMatrix, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1806: for (PetscInt k = 0; k < m; k++) {
1807: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1808: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1809: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1810: }
1811: PetscFunctionReturn(PETSC_SUCCESS);
1812: }