Actual source code: aijkok.kokkos.cxx
1: #include <petsc_kokkos.hpp>
2: #include <petscvec_kokkos.hpp>
3: #include <petscpkg_version.h>
4: #include <petsc/private/petscimpl.h>
5: #include <petsc/private/sfimpl.h>
6: #include <petscsystypes.h>
7: #include <petscerror.h>
9: #include <Kokkos_Core.hpp>
10: #include <KokkosBlas.hpp>
11: #include <KokkosSparse_CrsMatrix.hpp>
12: #include <KokkosSparse_spmv.hpp>
13: #include <KokkosSparse_spiluk.hpp>
14: #include <KokkosSparse_sptrsv.hpp>
15: #include <KokkosSparse_spgemm.hpp>
16: #include <KokkosSparse_spadd.hpp>
17: #include <KokkosBatched_LU_Decl.hpp>
18: #include <KokkosBatched_InverseLU_Decl.hpp>
20: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
22: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23: #include <KokkosSparse_Utils.hpp>
24: using KokkosSparse::sort_crs_matrix;
25: using KokkosSparse::Impl::transpose_matrix;
26: #else
27: #include <KokkosKernels_Sorting.hpp>
28: using KokkosKernels::sort_crs_matrix;
29: using KokkosKernels::Impl::transpose_matrix;
30: #endif
32: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
34: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35: we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36: In the latter case, it is important to set a_dual's sync state correctly.
37: */
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); /* only final assembly touches the device mirror */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); /* finish host-side (SeqAIJ) assembly first */

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
    delete aijkok;                                          /* delete on NULL is a no-op, so the first-time path is safe */
    aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    /* Wrap the host diagonal-pointer array and mirror it to device so diag_dual becomes usable */
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
66: /* Sync CSR data to device if not yet */
67: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68: {
69: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
71: PetscFunctionBegin;
72: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74: if (aijkok->a_dual.need_sync_device()) {
75: aijkok->a_dual.sync_device();
76: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77: aijkok->hermitian_updated = PETSC_FALSE;
78: }
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: /* Mark the CSR data on device as modified */
83: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84: {
85: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
87: PetscFunctionBegin;
88: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89: aijkok->a_dual.clear_sync_state();
90: aijkok->a_dual.modify_device();
91: aijkok->transpose_updated = PETSC_FALSE;
92: aijkok->hermitian_updated = PETSC_FALSE;
93: PetscCall(MatSeqAIJInvalidateDiagonal(A));
94: PetscCall(PetscObjectStateIncrease((PetscObject)A));
95: PetscFunctionReturn(PETSC_SUCCESS);
96: }
/* Sync CSR values from device to host if the host copy is stale */
static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto &exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect one needs factors on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCallCXX(aijkok->a_dual.sync_host(exec));
  PetscCallCXX(exec.fence()); /* the sync may be asynchronous; wait before the host reads the data */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Give the caller a host pointer to the (read-write) value array */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    auto &exec = PetscGetKokkosExecutionSpace();
    PetscCallCXX(aijkok->a_dual.sync_host(exec)); /* bring the latest values to host before handing out the pointer */
    PetscCallCXX(exec.fence());                   /* sync may be asynchronous; wait for completion */
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
134: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135: {
136: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
138: PetscFunctionBegin;
139: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140: PetscFunctionReturn(PETSC_SUCCESS);
141: }
143: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144: {
145: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
147: PetscFunctionBegin;
148: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149: auto &exec = PetscGetKokkosExecutionSpace();
150: PetscCallCXX(aijkok->a_dual.sync_host(exec));
151: PetscCallCXX(exec.fence());
152: *array = aijkok->a_dual.view_host().data();
153: } else {
154: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155: }
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160: {
161: PetscFunctionBegin;
162: *array = NULL;
163: PetscFunctionReturn(PETSC_SUCCESS);
164: }
166: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167: {
168: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
170: PetscFunctionBegin;
171: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172: *array = aijkok->a_dual.view_host().data();
173: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175: }
176: PetscFunctionReturn(PETSC_SUCCESS);
177: }
179: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180: {
181: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
183: PetscFunctionBegin;
184: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185: aijkok->a_dual.clear_sync_state();
186: aijkok->a_dual.modify_host();
187: }
188: PetscFunctionReturn(PETSC_SUCCESS);
189: }
/* Return device pointers to the CSR arrays (i, j, a) and report their memory type */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    aijkok->a_dual.sync_device(); /* values may be newer on host; push them to device first */
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}
208: /*
209: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
211: Input Parameter:
212: . A - the MATSEQAIJKOKKOS matrix
214: Output Parameters:
215: + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216: - T_d - the transpose on device, whose value array is allocated but not initialized
217: */
218: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219: {
220: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222: const PetscInt *Ai = aseq->i, *Aj = aseq->j;
223: MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224: MatRowMapType *Ti = Ti_h.data();
225: MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226: MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227: PetscInt *Tj = Tj_h.data();
228: PetscInt *perm = perm_h.data();
229: PetscInt *offset;
231: PetscFunctionBegin;
232: // Populate Ti
233: PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234: Ti++;
235: for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236: Ti--;
237: for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
239: // Populate Tj and the permutation array
240: PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241: for (PetscInt i = 0; i < m; i++) {
242: for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243: PetscInt r = Aj[j]; // row r of T
244: PetscInt disp = Ti[r] + offset[r];
246: Tj[disp] = i; // col i of T
247: perm[disp] = j;
248: offset[r]++;
249: }
250: }
251: PetscCall(PetscFree(offset));
253: // Sort each row of T, along with the permutation array
254: for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
256: // Output perm and T on device
257: auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258: auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259: PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260: PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261: PetscFunctionReturn(PETSC_SUCCESS);
262: }
264: // Generate the transpose on device and cache it internally
265: // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267: {
268: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271: KokkosCsrMatrix &T = akok->csrmatT;
273: PetscFunctionBegin;
274: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
277: const auto &Aa = akok->a_dual.view_device();
279: if (A->symmetric == PETSC_BOOL3_TRUE) {
280: *csrmatT = akok->csrmat;
281: } else {
282: // See if we already have a cached transpose and its value is up to date
283: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284: if (!akok->transpose_updated) { // if the value is out of date, update the cached version
285: const auto &perm = akok->transpose_perm; // get the permutation array
286: auto &Ta = T.values;
288: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289: }
290: } else { // Generate T of size n x m for the first time
291: MatRowMapKokkosView perm;
293: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296: }
297: akok->transpose_updated = PETSC_TRUE;
298: *csrmatT = akok->csrmatT;
299: }
300: PetscFunctionReturn(PETSC_SUCCESS);
301: }
303: // Generate the Hermitian on device and cache it internally
304: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305: {
306: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309: KokkosCsrMatrix &T = akok->csrmatH;
311: PetscFunctionBegin;
312: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
315: const auto &Aa = akok->a_dual.view_device();
317: if (A->hermitian == PETSC_BOOL3_TRUE) {
318: *csrmatH = akok->csrmat;
319: } else {
320: // See if we already have a cached hermitian and its value is up to date
321: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322: if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
323: const auto &perm = akok->transpose_perm; // get the permutation array
324: auto &Ta = T.values;
326: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327: }
328: } else { // Generate T of size n x m for the first time
329: MatRowMapKokkosView perm;
331: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334: }
335: akok->hermitian_updated = PETSC_TRUE;
336: *csrmatH = akok->csrmatH;
337: }
338: PetscFunctionReturn(PETSC_SUCCESS);
339: }
341: /* y = A x */
342: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343: {
344: Mat_SeqAIJKokkos *aijkok;
345: ConstPetscScalarKokkosView xv;
346: PetscScalarKokkosView yv;
348: PetscFunctionBegin;
349: PetscCall(PetscLogGpuTimeBegin());
350: PetscCall(MatSeqAIJKokkosSyncDevice(A));
351: PetscCall(VecGetKokkosView(xx, &xv));
352: PetscCall(VecGetKokkosViewWrite(yy, &yv));
353: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355: PetscCall(VecRestoreKokkosView(xx, &xv));
356: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357: /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359: PetscCall(PetscLogGpuTimeEnd());
360: PetscFunctionReturn(PETSC_SUCCESS);
361: }
363: /* y = A^T x */
364: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365: {
366: Mat_SeqAIJKokkos *aijkok;
367: const char *mode;
368: ConstPetscScalarKokkosView xv;
369: PetscScalarKokkosView yv;
370: KokkosCsrMatrix csrmat;
372: PetscFunctionBegin;
373: PetscCall(PetscLogGpuTimeBegin());
374: PetscCall(MatSeqAIJKokkosSyncDevice(A));
375: PetscCall(VecGetKokkosView(xx, &xv));
376: PetscCall(VecGetKokkosViewWrite(yy, &yv));
377: if (A->form_explicit_transpose) {
378: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379: mode = "N";
380: } else {
381: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382: csrmat = aijkok->csrmat;
383: mode = "T";
384: }
385: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386: PetscCall(VecRestoreKokkosView(xx, &xv));
387: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389: PetscCall(PetscLogGpuTimeEnd());
390: PetscFunctionReturn(PETSC_SUCCESS);
391: }
393: /* y = A^H x */
394: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395: {
396: Mat_SeqAIJKokkos *aijkok;
397: const char *mode;
398: ConstPetscScalarKokkosView xv;
399: PetscScalarKokkosView yv;
400: KokkosCsrMatrix csrmat;
402: PetscFunctionBegin;
403: PetscCall(PetscLogGpuTimeBegin());
404: PetscCall(MatSeqAIJKokkosSyncDevice(A));
405: PetscCall(VecGetKokkosView(xx, &xv));
406: PetscCall(VecGetKokkosViewWrite(yy, &yv));
407: if (A->form_explicit_transpose) {
408: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409: mode = "N";
410: } else {
411: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412: csrmat = aijkok->csrmat;
413: mode = "C";
414: }
415: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416: PetscCall(VecRestoreKokkosView(xx, &xv));
417: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419: PetscCall(PetscLogGpuTimeEnd());
420: PetscFunctionReturn(PETSC_SUCCESS);
421: }
423: /* z = A x + y */
424: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425: {
426: Mat_SeqAIJKokkos *aijkok;
427: ConstPetscScalarKokkosView xv, yv;
428: PetscScalarKokkosView zv;
430: PetscFunctionBegin;
431: PetscCall(PetscLogGpuTimeBegin());
432: PetscCall(MatSeqAIJKokkosSyncDevice(A));
433: PetscCall(VecGetKokkosView(xx, &xv));
434: PetscCall(VecGetKokkosView(yy, &yv));
435: PetscCall(VecGetKokkosViewWrite(zz, &zv));
436: if (zz != yy) Kokkos::deep_copy(zv, yv);
437: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
438: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
439: PetscCall(VecRestoreKokkosView(xx, &xv));
440: PetscCall(VecRestoreKokkosView(yy, &yv));
441: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
442: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
443: PetscCall(PetscLogGpuTimeEnd());
444: PetscFunctionReturn(PETSC_SUCCESS);
445: }
447: /* z = A^T x + y */
448: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
449: {
450: Mat_SeqAIJKokkos *aijkok;
451: const char *mode;
452: ConstPetscScalarKokkosView xv, yv;
453: PetscScalarKokkosView zv;
454: KokkosCsrMatrix csrmat;
456: PetscFunctionBegin;
457: PetscCall(PetscLogGpuTimeBegin());
458: PetscCall(MatSeqAIJKokkosSyncDevice(A));
459: PetscCall(VecGetKokkosView(xx, &xv));
460: PetscCall(VecGetKokkosView(yy, &yv));
461: PetscCall(VecGetKokkosViewWrite(zz, &zv));
462: if (zz != yy) Kokkos::deep_copy(zv, yv);
463: if (A->form_explicit_transpose) {
464: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
465: mode = "N";
466: } else {
467: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
468: csrmat = aijkok->csrmat;
469: mode = "T";
470: }
471: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
472: PetscCall(VecRestoreKokkosView(xx, &xv));
473: PetscCall(VecRestoreKokkosView(yy, &yv));
474: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
475: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
476: PetscCall(PetscLogGpuTimeEnd());
477: PetscFunctionReturn(PETSC_SUCCESS);
478: }
480: /* z = A^H x + y */
481: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
482: {
483: Mat_SeqAIJKokkos *aijkok;
484: const char *mode;
485: ConstPetscScalarKokkosView xv, yv;
486: PetscScalarKokkosView zv;
487: KokkosCsrMatrix csrmat;
489: PetscFunctionBegin;
490: PetscCall(PetscLogGpuTimeBegin());
491: PetscCall(MatSeqAIJKokkosSyncDevice(A));
492: PetscCall(VecGetKokkosView(xx, &xv));
493: PetscCall(VecGetKokkosView(yy, &yv));
494: PetscCall(VecGetKokkosViewWrite(zz, &zv));
495: if (zz != yy) Kokkos::deep_copy(zv, yv);
496: if (A->form_explicit_transpose) {
497: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
498: mode = "N";
499: } else {
500: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501: csrmat = aijkok->csrmat;
502: mode = "C";
503: }
504: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
505: PetscCall(VecRestoreKokkosView(xx, &xv));
506: PetscCall(VecRestoreKokkosView(yy, &yv));
507: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
508: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
509: PetscCall(PetscLogGpuTimeEnd());
510: PetscFunctionReturn(PETSC_SUCCESS);
511: }
513: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
514: {
515: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
517: PetscFunctionBegin;
518: switch (op) {
519: case MAT_FORM_EXPLICIT_TRANSPOSE:
520: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
521: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
522: A->form_explicit_transpose = flg;
523: break;
524: default:
525: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
526: break;
527: }
528: PetscFunctionReturn(PETSC_SUCCESS);
529: }
531: /* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); /* lazily initialize Kokkos if not done yet */
  if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    /* Retype A in place: switch the default vec type, the type name, and the function table */
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
557: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
558: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
559: */
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) { /* host holds A's latest values; copy host-to-host */
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  /* Finish retyping the duplicate: default vec type, type name, and function table */
  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Form B = A^T, either as a fresh matrix or by refreshing an existing one, reusing the internal cached transpose */
static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B)); /* mark B's device values as the latest */
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      /* B has no device mirror yet; copy the transpose values and column indices to B's host arrays */
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
628: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
629: {
630: Mat_SeqAIJKokkos *aijkok;
632: PetscFunctionBegin;
633: if (A->factortype == MAT_FACTOR_NONE) {
634: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
635: delete aijkok;
636: } else {
637: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
638: }
639: A->spptr = NULL;
640: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
641: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
642: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
643: PetscCall(MatDestroy_SeqAIJ(A));
644: PetscFunctionReturn(PETSC_SUCCESS);
645: }
647: /*MC
648: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
650: A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
652: Options Database Key:
653: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
655: Level: beginner
657: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
658: M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); /* make sure Kokkos is initialized before any device work */
  PetscCall(MatCreate_SeqAIJ(A));          /* build a plain SeqAIJ first... */
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); /* ...then retype it in place */
  PetscFunctionReturn(PETSC_SUCCESS);
}
668: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
669: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
670: {
671: Mat_SeqAIJ *a, *b;
672: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
673: MatScalarKokkosView aa, ba, ca;
674: MatRowMapKokkosView ai, bi, ci;
675: MatColIdxKokkosView aj, bj, cj;
676: PetscInt m, n, nnz, aN;
678: PetscFunctionBegin;
681: PetscAssertPointer(C, 4);
682: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
683: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
684: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
685: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
687: PetscCall(MatSeqAIJKokkosSyncDevice(A));
688: PetscCall(MatSeqAIJKokkosSyncDevice(B));
689: a = static_cast<Mat_SeqAIJ *>(A->data);
690: b = static_cast<Mat_SeqAIJ *>(B->data);
691: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
692: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
693: aa = akok->a_dual.view_device();
694: ai = akok->i_dual.view_device();
695: ba = bkok->a_dual.view_device();
696: bi = bkok->i_dual.view_device();
697: m = A->rmap->n; /* M, N and nnz of C */
698: n = A->cmap->n + B->cmap->n;
699: nnz = a->nz + b->nz;
700: aN = A->cmap->n; /* N of A */
701: if (reuse == MAT_INITIAL_MATRIX) {
702: aj = akok->j_dual.view_device();
703: bj = bkok->j_dual.view_device();
704: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
705: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
706: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
707: ca = ca_dual.view_device();
708: ci = ci_dual.view_device();
709: cj = cj_dual.view_device();
711: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
712: Kokkos::parallel_for(
713: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714: PetscInt i = t.league_rank(); /* row i */
715: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
717: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
718: ci(i) = coffset;
719: if (i == m - 1) ci(m) = ai(m) + bi(m);
720: });
722: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
723: if (k < alen) {
724: ca(coffset + k) = aa(ai(i) + k);
725: cj(coffset + k) = aj(ai(i) + k);
726: } else {
727: ca(coffset + k) = ba(bi(i) + k - alen);
728: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
729: }
730: });
731: });
732: ca_dual.modify_device();
733: ci_dual.modify_device();
734: cj_dual.modify_device();
735: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
736: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
737: } else if (reuse == MAT_REUSE_MATRIX) {
739: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
740: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
741: ca = ckok->a_dual.view_device();
742: ci = ckok->i_dual.view_device();
744: Kokkos::parallel_for(
745: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
746: PetscInt i = t.league_rank(); /* row i */
747: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
748: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
749: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
750: else ca(ci(i) + k) = ba(bi(i) + k - alen);
751: });
752: });
753: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
754: }
755: PetscFunctionReturn(PETSC_SUCCESS);
756: }
758: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
759: {
760: PetscFunctionBegin;
761: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
762: PetscFunctionReturn(PETSC_SUCCESS);
763: }
/* Numeric phase of C = op(A)*op(B), op() being identity or transpose, for SeqAIJKokkos matrices.
   Requires that the symbolic phase already ran: it created C's device CSR structure and the
   KokkosKernels spgemm handle stored in C->product->data. */
static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  bool                         transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  Mat_SeqAIJ                  *c;
  MatProductData_SeqAIJKokkos *pdata;
  KokkosCsrMatrix              csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Map the product type onto the transpose flags expected by KokkosKernels spgemm */
  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* spgemm reads A, B values on device */
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C)); /* the latest values of C now live on device */
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic phase of C = op(A)*op(B): determines C's sparsity pattern on device with
   KokkosKernels spgemm and creates the kernel handle kept in the product data, which
   the numeric phase reuses. */
static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  MatProductType               ptype;
  Mat                          A, B;
  bool                         transA, transB;
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  MatProductData_SeqAIJKokkos *pdata;
  MPI_Comm                     comm;
  KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  /* Map the (possibly simplified) product type onto spgemm transpose flags */
  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
  pdata->reusesym = product->api_user; // if called through the user API (e.g., MatMatMult), the first numeric phase may reuse the values computed below

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
    So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
    calling new Mat_SeqAIJKokkos().
    TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; /* pdata is freed when the product is destroyed */
  PetscFunctionReturn(PETSC_SUCCESS);
}
947: /* handles sparse matrix matrix ops */
948: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
949: {
950: Mat_Product *product = mat->product;
951: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
953: PetscFunctionBegin;
954: MatCheckProduct(mat, 1);
955: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
956: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
957: if (Biskok && Ciskok) {
958: switch (product->type) {
959: case MATPRODUCT_AB:
960: case MATPRODUCT_AtB:
961: case MATPRODUCT_ABt:
962: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
963: break;
964: case MATPRODUCT_PtAP:
965: case MATPRODUCT_RARt:
966: case MATPRODUCT_ABC:
967: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
968: break;
969: default:
970: break;
971: }
972: } else { /* fallback for AIJ */
973: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
974: }
975: PetscFunctionReturn(PETSC_SUCCESS);
976: }
978: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
979: {
980: Mat_SeqAIJKokkos *aijkok;
982: PetscFunctionBegin;
983: PetscCall(PetscLogGpuTimeBegin());
984: PetscCall(MatSeqAIJKokkosSyncDevice(A));
985: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
986: KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
987: PetscCall(MatSeqAIJKokkosModifyDevice(A));
988: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
989: PetscCall(PetscLogGpuTimeEnd());
990: PetscFunctionReturn(PETSC_SUCCESS);
991: }
// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n); // length of the main diagonal

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device(); // Adiag(i) indexes row i's diagonal entry within Aa
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Set (is == INSERT_VALUES) or add (otherwise) the vector D onto Y's main diagonal, on device.
   Falls back to the slow host path if Y has missing diagonal entries. */
static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt                   n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n); // length of the main diagonal
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device(); // Adiag(i) indexes row i's diagonal entry within Aa
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1051: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1052: {
1053: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1054: PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1055: ConstPetscScalarKokkosView lv, rv;
1057: PetscFunctionBegin;
1058: PetscCall(PetscLogGpuTimeBegin());
1059: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1060: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1061: const auto &Aa = aijkok->a_dual.view_device();
1062: const auto &Ai = aijkok->i_dual.view_device();
1063: const auto &Aj = aijkok->j_dual.view_device();
1064: if (ll) {
1065: PetscCall(VecGetLocalSize(ll, &m));
1066: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1067: PetscCall(VecGetKokkosView(ll, &lv));
1068: PetscCallCXX(Kokkos::parallel_for( // for each row
1069: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1070: PetscInt i = t.league_rank(); // row i
1071: PetscInt len = Ai(i + 1) - Ai(i);
1072: // scale entries on the row
1073: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1074: }));
1075: PetscCall(VecRestoreKokkosView(ll, &lv));
1076: PetscCall(PetscLogGpuFlops(nz));
1077: }
1078: if (rr) {
1079: PetscCall(VecGetLocalSize(rr, &n));
1080: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1081: PetscCall(VecGetKokkosView(rr, &rv));
1082: PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1083: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1084: PetscCall(VecRestoreKokkosView(rr, &lv));
1085: PetscCall(PetscLogGpuFlops(nz));
1086: }
1087: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1088: PetscCall(PetscLogGpuTimeEnd());
1089: PetscFunctionReturn(PETSC_SUCCESS);
1090: }
1092: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1093: {
1094: Mat_SeqAIJKokkos *aijkok;
1096: PetscFunctionBegin;
1097: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1098: if (aijkok) { /* Only zero the device if data is already there */
1099: KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1100: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1101: } else { /* Might be preallocated but not assembled */
1102: PetscCall(MatZeroEntries_SeqAIJ(A));
1103: }
1104: PetscFunctionReturn(PETSC_SUCCESS);
1105: }
/* Copy A's main diagonal into the vector x on device; rows whose diagonal entry is
   absent from the CSR contribute 0 */
static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos   *aijkok;
  PetscInt            n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device(); // Adiag(i) indexes row i's diagonal within Aa, or >= Ai(i+1) if missing

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); // diagonal entry present in the CSR
      else xv(i) = 0;                                 // missing diagonal entry is implicitly zero
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1135: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1136: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1137: {
1138: Mat_SeqAIJKokkos *aijkok;
1140: PetscFunctionBegin;
1142: PetscAssertPointer(kv, 2);
1143: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1144: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1145: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1146: *kv = aijkok->a_dual.view_device();
1147: PetscFunctionReturn(PETSC_SUCCESS);
1148: }
/* Companion to the const MatSeqAIJGetKokkosView(): read-only access needs no sync-back,
   so this only validates the arguments */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}
1159: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1160: {
1161: Mat_SeqAIJKokkos *aijkok;
1163: PetscFunctionBegin;
1165: PetscAssertPointer(kv, 2);
1166: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1167: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1168: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1169: *kv = aijkok->a_dual.view_device();
1170: PetscFunctionReturn(PETSC_SUCCESS);
1171: }
/* Companion to the read-write MatSeqAIJGetKokkosView(): mark the device copy modified
   so later host accesses trigger a sync */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1183: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1184: {
1185: Mat_SeqAIJKokkos *aijkok;
1187: PetscFunctionBegin;
1189: PetscAssertPointer(kv, 2);
1190: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192: *kv = aijkok->a_dual.view_device();
1193: PetscFunctionReturn(PETSC_SUCCESS);
1194: }
/* Companion to MatSeqAIJGetKokkosViewWrite(): mark the device copy modified so later
   host accesses trigger a sync */
PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1206: /* Computes Y += alpha X */
1207: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1208: {
1209: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1210: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1211: ConstMatScalarKokkosView Xa;
1212: MatScalarKokkosView Ya;
1213: auto &exec = PetscGetKokkosExecutionSpace();
1215: PetscFunctionBegin;
1216: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1217: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1218: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1219: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1220: PetscCall(PetscLogGpuTimeBegin());
1222: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1223: PetscBool e;
1224: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1225: if (e) {
1226: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1227: if (e) pattern = SAME_NONZERO_PATTERN;
1228: }
1229: }
1231: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1232: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1233: If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1234: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1235: */
1236: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1237: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1238: Xa = xkok->a_dual.view_device();
1239: Ya = ykok->a_dual.view_device();
1241: if (pattern == SAME_NONZERO_PATTERN) {
1242: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1243: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1244: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1245: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1246: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1248: Kokkos::parallel_for(
1249: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1250: PetscInt i = t.league_rank(); // row i
1251: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1252: // Only one thread works in a team
1253: PetscInt p, q = Yi(i);
1254: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1255: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1256: if (Xj(p) == Yj(q)) { // Found it
1257: Ya(q) += alpha * Xa(p);
1258: q++;
1259: } else {
1260: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1261: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1262: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1263: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1264: #else
1265: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1266: #endif
1267: }
1268: }
1269: });
1270: });
1271: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1272: } else { // different nonzero patterns
1273: Mat Z;
1274: KokkosCsrMatrix zcsr;
1275: KernelHandle kh;
1276: kh.create_spadd_handle(true); // X, Y are sorted
1277: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1278: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1279: zkok = new Mat_SeqAIJKokkos(zcsr);
1280: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1281: PetscCall(MatHeaderReplace(Y, &Z));
1282: kh.destroy_spadd_handle();
1283: }
1284: PetscCall(PetscLogGpuTimeEnd());
1285: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1286: PetscFunctionReturn(PETSC_SUCCESS);
1287: }
/* Device mirror of MatCOOStruct_SeqAIJ: the COO-assembly metadata copied into Kokkos device views,
   consumed by MatSetValuesCOO_SeqAIJKokkos() */
struct MatCOOStruct_SeqAIJKokkos {
  PetscCount           n;    // number of COO entries supplied by the user
  PetscCount           Atot; // length of perm
  PetscInt             nz;   // number of nonzeros in the resulting CSR matrix
  PetscCountKokkosView jmap; // entries k in [jmap(i), jmap(i+1)) of perm contribute to the i-th CSR nonzero
  PetscCountKokkosView perm; // perm(k) is an index into the user's COO value array v[]

  // Deep-copy the host-side COO struct into device memory
  MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
  {
    nz   = coo_h->nz;
    n    = coo_h->n;
    Atot = coo_h->Atot;
    jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
    perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
  }
};
1306: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1307: {
1308: PetscFunctionBegin;
1309: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1310: PetscFunctionReturn(PETSC_SUCCESS);
1311: }
/* Preallocate the matrix from COO (i,j) index arrays. The host-side SeqAIJ routine builds
   the CSR pattern and host COO metadata; we then rebuild the device structures and attach
   a device copy of the COO metadata for use by MatSetValuesCOO_SeqAIJKokkos(). */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJKokkos          *akok;
  Mat_SeqAIJ                *aseq;
  PetscContainer             container_h, container_d;
  MatCOOStruct_SeqAIJ       *coo_h;
  MatCOOStruct_SeqAIJKokkos *coo_d;

  PetscFunctionBegin;
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); /* builds the CSR pattern and host COO struct */
  aseq = static_cast<Mat_SeqAIJ *>(mat->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
  delete akok; /* the old device structure (if any) no longer matches the new pattern */
  mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
  PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d)); /* the matrix now holds the only reference */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Fill the matrix with the COO values v[] laid out as given to MatSetPreallocationCOO.
   v[] may be in host or device memory; duplicate COO entries are summed into their CSR slot. */
static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
{
  MatScalarKokkosView        Aa;
  ConstMatScalarKokkosView   kv;
  PetscMemType               memtype;
  PetscContainer             container;
  MatCOOStruct_SeqAIJKokkos *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &n    = coo->n;    // number of COO entries in v[]
  const auto &Annz = coo->nz;   // number of nonzeros in A's CSR
  const auto &jmap = coo->jmap; // per-nonzero ranges into perm
  const auto &perm = coo->perm; // maps those ranges to positions in v[]

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
  } else {
    kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
  else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

  PetscCall(PetscLogGpuTimeBegin());
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); // sum all COO entries mapped onto nonzero i
      Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
    });
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
  else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* LU factorization delegates to the host (CPU) SeqAIJ implementation */
static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncHost(A)); /* factorization runs on host, so A's latest values must be there */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the factor's up-to-date data now lives on the CPU */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Install the SeqAIJKokkos implementations into the Mat and Mat_SeqAIJ ops tables,
   and compose the COO assembly functions on the object */
static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
  A->boundtocpu  = PETSC_FALSE;

  A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
  A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
  A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
  A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
  A->ops->scale                     = MatScale_SeqAIJKokkos;
  A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
  A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
  A->ops->mult                      = MatMult_SeqAIJKokkos;
  A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
  A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
  A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
  A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
  A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
  A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
  A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
  A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
  A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
  A->ops->shift                     = MatShift_SeqAIJKokkos;
  A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
  A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
  /* array accessors on the inner SeqAIJ ops table */
  a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
  a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
  a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
  a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
  a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
  a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
  a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Extract the (prescribed) diagonal blocks of the matrix and then invert them

  Input Parameters:
+  A       - the MATSEQAIJKOKKOS matrix
.  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
.  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
.  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
-  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

  Output Parameter:
.  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
*/
PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
{
  Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nblocks = bs.extent(0) - 1; // bs is in 'csr' format, so it has nblocks+1 entries

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

  // Pull out the diagonal blocks of the matrix and then invert the blocks
  auto Aa    = akok->a_dual.view_device();
  auto Ai    = akok->i_dual.view_device();
  auto Aj    = akok->j_dual.view_device();
  auto Adiag = akok->diag_dual.view_device();
  // TODO: how to tune the team size?
#if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
  auto ts = Kokkos::AUTO();
#else
  auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
#endif
  PetscCallCXX(Kokkos::parallel_for(
    Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
      const PetscInt bid    = teamMember.league_rank();                                             // block id
      const PetscInt rstart = bs(bid);                                                              // this block starts from this row
      const PetscInt m      = bs(bid + 1) - bs(bid);                                                // size of this block
      const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
      const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);                        // scratch for the in-place inverse

      // Gather the dense m x m diagonal block B from A's CSR storage, one row per thread
      Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
        PetscInt i = rstart + r; // i-th row in A

        if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
          PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

          for (PetscInt c = 0; c < m; c++) {                  // walk n steps to see what column indices we will meet
            if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
              B(r, c) = 0.0;
            } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
              B(r, c) = Aa(first + c);
            } else { // this entry does not show up in the CSR
              B(r, c) = 0.0;
            }
          }
        } else { // rare case that the diagonal does not exist
          const PetscInt begin = Ai(i);
          const PetscInt end   = Ai(i + 1);
          for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
          for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
            if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
            else if (Aj(j) >= rstart + m) break;
          }
        }
      });

      // LU-decompose B (w/o pivoting) and then invert B
      KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
      KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
    }));
  // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Attach an existing Mat_SeqAIJKokkos (device CSR in dual views) to an empty Mat A, turning A
   into a fully assembled square MATSEQAIJKOKKOS whose host CSR arrays alias akok's host mirrors.

   Input Parameters:
+  A    - an empty Mat (A->spptr must be NULL; checked below)
-  akok - the Mat_SeqAIJKokkos to take over; it retains ownership of the i/j/a arrays

   Note: akok owns the CSR arrays, hence free_a/free_ij are set PETSC_FALSE so that
   MatDestroy() on the SeqAIJ side will not free memory it does not own.
*/
PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
{
  Mat_SeqAIJ *aseq;
  PetscInt    i, m, n;
  auto       &exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

  m = akok->nrows();
  n = akok->ncols();
  PetscCall(MatSetSizes(A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

  /* Set up data structures of A as a MATSEQAIJ */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
  aseq = (Mat_SeqAIJ *)(A)->data;

  PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
  PetscCallCXX(akok->j_dual.sync_host(exec));
  PetscCallCXX(exec.fence()); /* ensure the host copies are complete before aliasing them below */

  /* The SeqAIJ arrays alias akok's host-side data; no copies are made */
  aseq->i            = akok->i_host_data();
  aseq->j            = akok->j_host_data();
  aseq->a            = akok->a_host_data();
  aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
  aseq->singlemalloc = PETSC_FALSE;
  aseq->free_a       = PETSC_FALSE; /* akok, not the SeqAIJ part, owns a */
  aseq->free_ij      = PETSC_FALSE; /* likewise for i and j */
  aseq->nz           = akok->nnz();
  aseq->maxnz        = aseq->nz;

  PetscCall(PetscMalloc1(m, &aseq->imax));
  PetscCall(PetscMalloc1(m, &aseq->ilen));
  /* Every row is exactly full: ilen == imax == row length taken from the CSR row map */
  for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

  /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
  akok->nonzerostate = A->nonzerostate;
  A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1552: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1553: {
1554: PetscFunctionBegin;
1555: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1556: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1557: PetscFunctionReturn(PETSC_SUCCESS);
1558: }
1560: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1561: {
1562: Mat_SeqAIJKokkos *akok;
1564: PetscFunctionBegin;
1565: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1566: PetscCall(MatCreate(comm, A));
1567: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1568: PetscFunctionReturn(PETSC_SUCCESS);
1569: }
/* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

   Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
*/
1575: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1576: {
1577: PetscFunctionBegin;
1578: PetscCall(MatCreate(comm, A));
1579: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1580: PetscFunctionReturn(PETSC_SUCCESS);
1581: }
1583: /*@C
1584: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1585: (the default parallel PETSc format). This matrix will ultimately be handled by
1586: Kokkos for calculations.
1588: Collective
1590: Input Parameters:
1591: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1592: . m - number of rows
1593: . n - number of columns
1594: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1595: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1597: Output Parameter:
1598: . A - the matrix
1600: Level: intermediate
1602: Notes:
1603: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
MatXXXXSetPreallocation() paradigm instead of this routine directly.
1605: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1607: The AIJ format, also called
1608: compressed row storage, is fully compatible with standard Fortran
1609: storage. That is, the stored row and column indices can begin at
1610: either one (as in Fortran) or zero.
1612: Specify the preallocated storage with either `nz` or `nnz` (not both).
1613: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1614: allocation.
1616: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1617: @*/
1618: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1619: {
1620: PetscFunctionBegin;
1621: PetscCall(PetscKokkosInitializeCheck());
1622: PetscCall(MatCreate(comm, A));
1623: PetscCall(MatSetSizes(*A, m, n, m, n));
1624: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1625: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1626: PetscFunctionReturn(PETSC_SUCCESS);
1627: }
1629: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1630: {
1631: PetscFunctionBegin;
1632: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1633: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1634: PetscFunctionReturn(PETSC_SUCCESS);
1635: }
1637: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1638: {
1639: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1641: PetscFunctionBegin;
1642: if (!factors->sptrsv_symbolic_completed) {
1643: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1644: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1645: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1646: }
1647: PetscFunctionReturn(PETSC_SUCCESS);
1648: }
/* Check if we need to update factors etc for transpose solve.

   Lazily (re)builds the explicit transposes L^T and U^T of the stored factors and runs sptrsv
   symbolic analysis on them. Invalidated (transpose_updated = PETSC_FALSE) whenever the factor
   values change, e.g. in MatILUFactorNumeric_SeqAIJKokkos.
*/
static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
{
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  MatColIdxType               n       = A->rmap->n;

  PetscFunctionBegin;
  if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
    /* Update L^T and do sptrsv symbolic */
    factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
    factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); /* transpose has same nnz as L */
    factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));

    transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
                                                                                                                                                                                                             factors->iLt_d, factors->jLt_d, factors->aLt_d);

    /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
      We have to sort the indices, until KK provides finer control options.
    */
    sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

    KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

    /* Update U^T and do sptrsv symbolic */
    factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
    factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
    factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));

    transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
                                                                                                                                                                                                             factors->iUt_d, factors->jUt_d, factors->aUt_d);

    /* Sort indices. See comments above */
    sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

    KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
    factors->transpose_updated = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Solve Ax = b, with A = LU.

   Two triangular solves on device: L tmpv = b, then U x = tmpv, using the sptrsv handles set up
   in MatSeqAIJKokkosSymbolicSolveCheck(). factors->workVector holds the intermediate tmpv.
*/
static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
{
  ConstPetscScalarKokkosView  bv;
  PetscScalarKokkosView       xv;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); /* lazily run the sptrsv symbolic phase if needed */
  PetscCall(VecGetKokkosView(b, &bv));
  PetscCall(VecGetKokkosViewWrite(x, &xv)); /* write-only access: x is fully overwritten */
  /* Solve L tmpv = b */
  PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
  /* Solve Ux = tmpv */
  PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
  PetscCall(VecRestoreKokkosView(b, &bv));
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
1712: /* Solve A^T x = b, where A^T = U^T L^T */
1713: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1714: {
1715: ConstPetscScalarKokkosView bv;
1716: PetscScalarKokkosView xv;
1717: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1719: PetscFunctionBegin;
1720: PetscCall(PetscLogGpuTimeBegin());
1721: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1722: PetscCall(VecGetKokkosView(b, &bv));
1723: PetscCall(VecGetKokkosViewWrite(x, &xv));
1724: /* Solve U^T tmpv = b */
1725: KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1727: /* Solve L^T x = tmpv */
1728: KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1729: PetscCall(VecRestoreKokkosView(b, &bv));
1730: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1731: PetscCall(PetscLogGpuTimeEnd());
1732: PetscFunctionReturn(PETSC_SUCCESS);
1733: }
1735: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1736: {
1737: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1738: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1739: PetscInt fill_lev = info->levels;
1741: PetscFunctionBegin;
1742: PetscCall(PetscLogGpuTimeBegin());
1743: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1745: auto a_d = aijkok->a_dual.view_device();
1746: auto i_d = aijkok->i_dual.view_device();
1747: auto j_d = aijkok->j_dual.view_device();
1749: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1751: B->assembled = PETSC_TRUE;
1752: B->preallocated = PETSC_TRUE;
1753: B->ops->solve = MatSolve_SeqAIJKokkos;
1754: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1755: B->ops->matsolve = NULL;
1756: B->ops->matsolvetranspose = NULL;
1757: B->offloadmask = PETSC_OFFLOAD_GPU;
1759: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1760: factors->transpose_updated = PETSC_FALSE;
1761: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1762: /* TODO: log flops, but how to know that? */
1763: PetscCall(PetscLogGpuTimeEnd());
1764: PetscFunctionReturn(PETSC_SUCCESS);
1765: }
/* Symbolic ILU(k) factorization of A into B on device.

   Creates the spiluk and sptrsv handles, runs the symbolic phase to determine the sparsity of
   L and U, and sizes the factor arrays accordingly. isrow/iscol are not used by this
   implementation (the factor type advertises canuseordering = PETSC_FALSE).
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJKokkos           *aijkok;
  Mat_SeqAIJ                 *b;
  Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
  PetscInt                    fill_lev = info->levels; /* ILU(k): level of fill */
  PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
  PetscInt                    n        = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  /* Rebuild factors */
  if (factors) {
    factors->Destroy();
  } /* Destroy the old if it exists */
  else {
    B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
  }

  /* Create a spiluk handle and then do symbolic factorization */
  nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); /* initial nnz guess from the user's fill ratio */
  factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

  auto spiluk_handle = factors->kh.get_spiluk_handle();

  Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
  Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
  Kokkos::realloc(factors->iU_d, n + 1);
  Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

  aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();
  KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
  /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

  /* The symbolic phase updates the handle's nnz counts; match the array sizes to them */
  Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
  Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
  Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
  Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

  /* TODO: add options to select sptrsv algorithms */
  /* Create sptrsv handles for L, U and their transpose */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
  auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif

  factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
  factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
  factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
  factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

  /* Fill fields of the factor matrix B */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b     = (Mat_SeqAIJ *)B->data;
  b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
  B->info.fill_ratio_given  = info->fill;
  B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; /* guard against empty A */

  B->offloadmask          = PETSC_OFFLOAD_GPU;
  B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1833: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1834: {
1835: PetscFunctionBegin;
1836: *type = MATSOLVERKOKKOS;
1837: PetscFunctionReturn(PETSC_SUCCESS);
1838: }
1840: /*MC
  MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS` on a single GPU.
1844: Level: beginner
1846: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1847: M*/
/* Create an (empty) factor matrix B of type MATSEQAIJKOKKOS suitable for LU or ILU
   factorization of A with the Kokkos solver package. The symbolic routines installed
   below fill B later. */
PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n)); /* factors are square, sized like A's row map */
  (*B)->factortype = ftype;
  PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); /* nested dissection preferred for LU */
  PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

  if (ftype == MAT_FACTOR_LU) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    (*B)->canuseordering         = PETSC_TRUE; /* LU can honor a user/ordering IS */
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
  } else if (ftype == MAT_FACTOR_ILU) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    (*B)->canuseordering         = PETSC_FALSE; /* the Kokkos ILU(k) path ignores orderings */
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1874: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1875: {
1876: PetscFunctionBegin;
1877: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1878: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1879: PetscFunctionReturn(PETSC_SUCCESS);
1880: }
1882: /* Utility to print out a KokkosCsrMatrix for debugging */
1883: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1884: {
1885: const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1886: const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1887: const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1888: const PetscInt *i = iv.data();
1889: const PetscInt *j = jv.data();
1890: const PetscScalar *a = av.data();
1891: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1893: PetscFunctionBegin;
1894: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1895: for (PetscInt k = 0; k < m; k++) {
1896: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1897: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1898: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1899: }
1900: PetscFunctionReturn(PETSC_SUCCESS);
1901: }