Actual source code: aijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscpkg_version.h>
3: #include <petsc/private/petscimpl.h>
4: #include <petsc/private/sfimpl.h>
5: #include <petscsystypes.h>
6: #include <petscerror.h>
8: #include <Kokkos_Core.hpp>
9: #include <KokkosBlas.hpp>
10: #include <KokkosKernels_Sorting.hpp>
11: #include <KokkosSparse_CrsMatrix.hpp>
12: #include <KokkosSparse_spmv.hpp>
13: #include <KokkosSparse_spiluk.hpp>
14: #include <KokkosSparse_sptrsv.hpp>
15: #include <KokkosSparse_spgemm.hpp>
16: #include <KokkosSparse_spadd.hpp>
18: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
22: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23: we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24: In the latter case, it is important to set a_dual's sync state correctly.
25: */
26: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27: {
28: Mat_SeqAIJ *aijseq;
29: Mat_SeqAIJKokkos *aijkok;
31: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
32: MatAssemblyEnd_SeqAIJ(A,mode);
34: aijseq = static_cast<Mat_SeqAIJ*>(A->data);
35: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37: /* If aijkok does not exist, we just copy i, j to device.
38: If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
39: In both cases, we build a new aijkok structure.
40: */
41: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
42: delete aijkok;
43: aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
44: A->spptr = aijkok;
45: }
47: if (aijkok && aijkok->device_mat_d.data()) {
48: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
49: }
50: return 0;
51: }
53: /* Sync CSR data to device if not yet */
54: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
55: {
56: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60: if (aijkok->a_dual.need_sync_device()) {
61: aijkok->a_dual.sync_device();
62: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
63: aijkok->hermitian_updated = PETSC_FALSE;
64: }
65: return 0;
66: }
68: /* Mark the CSR data on device as modified */
69: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
70: {
71: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74: aijkok->a_dual.clear_sync_state();
75: aijkok->a_dual.modify_device();
76: aijkok->transpose_updated = PETSC_FALSE;
77: aijkok->hermitian_updated = PETSC_FALSE;
78: MatSeqAIJInvalidateDiagonal(A);
79: PetscObjectStateIncrease((PetscObject)A);
80: return 0;
81: }
83: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
84: {
85: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88: /* We do not expect one needs factors on host */
91: aijkok->a_dual.sync_host();
92: return 0;
93: }
95: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
96: {
97: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
99: if (aijkok) {
100: aijkok->a_dual.sync_host();
101: *array = aijkok->a_dual.view_host().data();
102: } else { /* Happens when calling MatSetValues on a newly created matrix */
103: *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
104: }
105: return 0;
106: }
108: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
109: {
110: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
112: if (aijkok) aijkok->a_dual.modify_host();
113: return 0;
114: }
116: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
117: {
118: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120: if (aijkok) {
121: aijkok->a_dual.sync_host();
122: *array = aijkok->a_dual.view_host().data();
123: } else {
124: *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
125: }
126: return 0;
127: }
129: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
130: {
131: *array = NULL;
132: return 0;
133: }
135: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
136: {
137: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
139: if (aijkok) {
140: *array = aijkok->a_dual.view_host().data();
141: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
142: *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
143: }
144: return 0;
145: }
147: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148: {
149: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151: if (aijkok) {
152: aijkok->a_dual.clear_sync_state();
153: aijkok->a_dual.modify_host();
154: }
155: return 0;
156: }
158: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
159: {
160: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164: if (i) *i = aijkok->i_device_data();
165: if (j) *j = aijkok->j_device_data();
166: if (a) {
167: aijkok->a_dual.sync_device();
168: *a = aijkok->a_device_data();
169: }
170: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
171: return 0;
172: }
174: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
175: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
176: {
177: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
178: Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
181: aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
182: Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
183: return 0;
184: }
186: // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
187: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
188: {
189: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
191: if (aijkok && aijkok->device_mat_d.data()) {
192: *d_mat = aijkok->device_mat_d.data();
193: } else {
194: MatSeqAIJKokkosSyncDevice(A); // create aijkok (we are making d_mat now so make a place for it)
195: *d_mat = NULL;
196: }
197: return 0;
198: }
200: /* Generate the transpose on device and cache it internally */
201: static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202: {
203: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
206: if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
207: /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
208: aijkok->a_dual.sync_device();
209: aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat);
210: KokkosKernels::sort_crs_matrix(aijkok->csrmatT);
211: aijkok->transpose_updated = PETSC_TRUE;
212: }
213: *csrmatT = &aijkok->csrmatT;
214: return 0;
215: }
217: /* Generate the Hermitian on device and cache it internally */
218: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
219: {
220: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222: PetscLogGpuTimeBegin();
224: if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
225: aijkok->a_dual.sync_device();
226: aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat);
227: KokkosKernels::sort_crs_matrix(aijkok->csrmatH);
228: #if defined(PETSC_USE_COMPLEX)
229: const auto& a = aijkok->csrmatH.values;
230: Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
231: #endif
232: aijkok->hermitian_updated = PETSC_TRUE;
233: }
234: *csrmatH = &aijkok->csrmatH;
235: PetscLogGpuTimeEnd();
236: return 0;
237: }
239: /* y = A x */
240: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
241: {
242: Mat_SeqAIJKokkos *aijkok;
243: ConstPetscScalarKokkosView xv;
244: PetscScalarKokkosView yv;
246: PetscLogGpuTimeBegin();
247: MatSeqAIJKokkosSyncDevice(A);
248: VecGetKokkosView(xx,&xv);
249: VecGetKokkosViewWrite(yy,&yv);
250: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
251: KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
252: VecRestoreKokkosView(xx,&xv);
253: VecRestoreKokkosViewWrite(yy,&yv);
254: /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
255: PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());
256: PetscLogGpuTimeEnd();
257: return 0;
258: }
260: /* y = A^T x */
261: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
262: {
263: Mat_SeqAIJKokkos *aijkok;
264: const char *mode;
265: ConstPetscScalarKokkosView xv;
266: PetscScalarKokkosView yv;
267: KokkosCsrMatrix *csrmat;
269: PetscLogGpuTimeBegin();
270: MatSeqAIJKokkosSyncDevice(A);
271: VecGetKokkosView(xx,&xv);
272: VecGetKokkosViewWrite(yy,&yv);
273: if (A->form_explicit_transpose) {
274: MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);
275: mode = "N";
276: } else {
277: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278: csrmat = &aijkok->csrmat;
279: mode = "T";
280: }
281: KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
282: VecRestoreKokkosView(xx,&xv);
283: VecRestoreKokkosViewWrite(yy,&yv);
284: PetscLogGpuFlops(2.0*csrmat->nnz());
285: PetscLogGpuTimeEnd();
286: return 0;
287: }
289: /* y = A^H x */
290: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
291: {
292: Mat_SeqAIJKokkos *aijkok;
293: const char *mode;
294: ConstPetscScalarKokkosView xv;
295: PetscScalarKokkosView yv;
296: KokkosCsrMatrix *csrmat;
298: PetscLogGpuTimeBegin();
299: MatSeqAIJKokkosSyncDevice(A);
300: VecGetKokkosView(xx,&xv);
301: VecGetKokkosViewWrite(yy,&yv);
302: if (A->form_explicit_transpose) {
303: MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);
304: mode = "N";
305: } else {
306: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
307: csrmat = &aijkok->csrmat;
308: mode = "C";
309: }
310: KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
311: VecRestoreKokkosView(xx,&xv);
312: VecRestoreKokkosViewWrite(yy,&yv);
313: PetscLogGpuFlops(2.0*csrmat->nnz());
314: PetscLogGpuTimeEnd();
315: return 0;
316: }
318: /* z = A x + y */
319: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
320: {
321: Mat_SeqAIJKokkos *aijkok;
322: ConstPetscScalarKokkosView xv,yv;
323: PetscScalarKokkosView zv;
325: PetscLogGpuTimeBegin();
326: MatSeqAIJKokkosSyncDevice(A);
327: VecGetKokkosView(xx,&xv);
328: VecGetKokkosView(yy,&yv);
329: VecGetKokkosViewWrite(zz,&zv);
330: if (zz != yy) Kokkos::deep_copy(zv,yv);
331: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
332: KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
333: VecRestoreKokkosView(xx,&xv);
334: VecRestoreKokkosView(yy,&yv);
335: VecRestoreKokkosViewWrite(zz,&zv);
336: PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());
337: PetscLogGpuTimeEnd();
338: return 0;
339: }
341: /* z = A^T x + y */
342: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
343: {
344: Mat_SeqAIJKokkos *aijkok;
345: const char *mode;
346: ConstPetscScalarKokkosView xv,yv;
347: PetscScalarKokkosView zv;
348: KokkosCsrMatrix *csrmat;
350: PetscLogGpuTimeBegin();
351: MatSeqAIJKokkosSyncDevice(A);
352: VecGetKokkosView(xx,&xv);
353: VecGetKokkosView(yy,&yv);
354: VecGetKokkosViewWrite(zz,&zv);
355: if (zz != yy) Kokkos::deep_copy(zv,yv);
356: if (A->form_explicit_transpose) {
357: MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);
358: mode = "N";
359: } else {
360: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
361: csrmat = &aijkok->csrmat;
362: mode = "T";
363: }
364: KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
365: VecRestoreKokkosView(xx,&xv);
366: VecRestoreKokkosView(yy,&yv);
367: VecRestoreKokkosViewWrite(zz,&zv);
368: PetscLogGpuFlops(2.0*csrmat->nnz());
369: PetscLogGpuTimeEnd();
370: return 0;
371: }
373: /* z = A^H x + y */
374: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
375: {
376: Mat_SeqAIJKokkos *aijkok;
377: const char *mode;
378: ConstPetscScalarKokkosView xv,yv;
379: PetscScalarKokkosView zv;
380: KokkosCsrMatrix *csrmat;
382: PetscLogGpuTimeBegin();
383: MatSeqAIJKokkosSyncDevice(A);
384: VecGetKokkosView(xx,&xv);
385: VecGetKokkosView(yy,&yv);
386: VecGetKokkosViewWrite(zz,&zv);
387: if (zz != yy) Kokkos::deep_copy(zv,yv);
388: if (A->form_explicit_transpose) {
389: MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);
390: mode = "N";
391: } else {
392: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
393: csrmat = &aijkok->csrmat;
394: mode = "C";
395: }
396: KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
397: VecRestoreKokkosView(xx,&xv);
398: VecRestoreKokkosView(yy,&yv);
399: VecRestoreKokkosViewWrite(zz,&zv);
400: PetscLogGpuFlops(2.0*csrmat->nnz());
401: PetscLogGpuTimeEnd();
402: return 0;
403: }
405: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
406: {
407: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
409: switch (op) {
410: case MAT_FORM_EXPLICIT_TRANSPOSE:
411: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
412: if (A->form_explicit_transpose && !flg && aijkok) aijkok->DestroyMatTranspose();
413: A->form_explicit_transpose = flg;
414: break;
415: default:
416: MatSetOption_SeqAIJ(A,op,flg);
417: break;
418: }
419: return 0;
420: }
422: /* Depending on reuse, either build a new mat, or use the existing mat */
423: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
424: {
425: Mat_SeqAIJ *aseq;
427: PetscKokkosInitializeCheck();
428: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
429: MatDuplicate(A,MAT_COPY_VALUES,newmat); /* the returned newmat is a SeqAIJKokkos */
430: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
431: MatCopy(A,*newmat,SAME_NONZERO_PATTERN); /* newmat is already a SeqAIJKokkos */
432: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
434: PetscFree(A->defaultvectype);
435: PetscStrallocpy(VECKOKKOS,&A->defaultvectype); /* Allocate and copy the string */
436: PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);
437: MatSetOps_SeqAIJKokkos(A);
438: aseq = static_cast<Mat_SeqAIJ*>(A->data);
439: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
441: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
442: }
443: }
444: return 0;
445: }
447: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
448: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
449: */
450: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
451: {
452: Mat_SeqAIJ *bseq;
453: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454: Mat mat;
456: /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
457: MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
458: mat = *B;
459: if (A->assembled) {
460: bseq = static_cast<Mat_SeqAIJ*>(mat->data);
461: bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
462: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
463: /* Now copy values to B if needed */
464: if (dupOption == MAT_COPY_VALUES) {
465: if (akok->a_dual.need_sync_device()) {
466: Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
467: bkok->a_dual.modify_host();
468: } else { /* If device has the latest data, we only copy data on device */
469: Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
470: bkok->a_dual.modify_device();
471: }
472: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
473: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
474: bkok->a_dual.modify_host();
475: }
476: mat->spptr = bkok;
477: }
479: PetscFree(mat->defaultvectype);
480: PetscStrallocpy(VECKOKKOS,&mat->defaultvectype); /* Allocate and copy the string */
481: PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);
482: MatSetOps_SeqAIJKokkos(mat);
483: return 0;
484: }
486: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
487: {
488: Mat At;
489: KokkosCsrMatrix *internT;
490: Mat_SeqAIJKokkos *atkok,*bkok;
492: MatSeqAIJKokkosGenerateTranspose_Private(A,&internT); /* Generate a transpose internally */
493: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
494: /* Deep copy internT, as we want to isolate the internal transpose */
495: atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT));
496: MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);
497: if (reuse == MAT_INITIAL_MATRIX) *B = At;
498: else MatHeaderReplace(A,&At); /* Replace A with At inplace */
499: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
500: if ((*B)->assembled) {
501: bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
502: Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values);
503: MatSeqAIJKokkosModifyDevice(*B);
504: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
505: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
506: MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
507: MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
508: Kokkos::deep_copy(a_h,internT->values);
509: Kokkos::deep_copy(j_h,internT->graph.entries);
510: } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
511: }
512: return 0;
513: }
515: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
516: {
517: Mat_SeqAIJKokkos *aijkok;
519: if (A->factortype == MAT_FACTOR_NONE) {
520: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
521: delete aijkok;
522: } else {
523: delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
524: }
525: A->spptr = NULL;
526: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
527: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
528: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
529: MatDestroy_SeqAIJ(A);
530: return 0;
531: }
533: /*MC
534: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
536: A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
538: Options Database Keys:
539: . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
541: Level: beginner
543: .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
544: M*/
545: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
546: {
547: PetscKokkosInitializeCheck();
548: MatCreate_SeqAIJ(A);
549: MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);
550: return 0;
551: }
553: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
554: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
555: {
556: Mat_SeqAIJ *a,*b;
557: Mat_SeqAIJKokkos *akok,*bkok,*ckok;
558: MatScalarKokkosView aa,ba,ca;
559: MatRowMapKokkosView ai,bi,ci;
560: MatColIdxKokkosView aj,bj,cj;
561: PetscInt m,n,nnz,aN;
571: MatSeqAIJKokkosSyncDevice(A);
572: MatSeqAIJKokkosSyncDevice(B);
573: a = static_cast<Mat_SeqAIJ*>(A->data);
574: b = static_cast<Mat_SeqAIJ*>(B->data);
575: akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
576: bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
577: aa = akok->a_dual.view_device();
578: ai = akok->i_dual.view_device();
579: ba = bkok->a_dual.view_device();
580: bi = bkok->i_dual.view_device();
581: m = A->rmap->n; /* M, N and nnz of C */
582: n = A->cmap->n + B->cmap->n;
583: nnz = a->nz + b->nz;
584: aN = A->cmap->n; /* N of A */
585: if (reuse == MAT_INITIAL_MATRIX) {
586: aj = akok->j_dual.view_device();
587: bj = bkok->j_dual.view_device();
588: auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
589: auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
590: auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
591: ca = ca_dual.view_device();
592: ci = ci_dual.view_device();
593: cj = cj_dual.view_device();
595: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
596: Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
597: PetscInt i = t.league_rank(); /* row i */
598: PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
600: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
601: ci(i) = coffset;
602: if (i == m-1) ci(m) = ai(m) + bi(m);
603: });
605: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
606: if (k < alen) {
607: ca(coffset+k) = aa(ai(i)+k);
608: cj(coffset+k) = aj(ai(i)+k);
609: } else {
610: ca(coffset+k) = ba(bi(i)+k-alen);
611: cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
612: }
613: });
614: });
615: ca_dual.modify_device();
616: ci_dual.modify_device();
617: cj_dual.modify_device();
618: ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual);
619: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);
620: } else if (reuse == MAT_REUSE_MATRIX) {
623: ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
624: ca = ckok->a_dual.view_device();
625: ci = ckok->i_dual.view_device();
627: Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
628: PetscInt i = t.league_rank(); /* row i */
629: PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
630: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
631: if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
632: else ca(ci(i)+k) = ba(bi(i)+k-alen);
633: });
634: });
635: MatSeqAIJKokkosModifyDevice(*C);
636: }
637: return 0;
638: }
640: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
641: {
642: delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
643: return 0;
644: }
646: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
647: {
648: Mat_Product *product = C->product;
649: Mat A,B;
650: bool transA,transB; /* use bool, since KK needs this type */
651: Mat_SeqAIJKokkos *akok,*bkok,*ckok;
652: Mat_SeqAIJ *c;
653: MatProductData_SeqAIJKokkos *pdata;
654: KokkosCsrMatrix *csrmatA,*csrmatB;
656: MatCheckProduct(C,1);
658: pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
660: if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
661: pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
662: return 0;
663: }
665: switch (product->type) {
666: case MATPRODUCT_AB: transA = false; transB = false; break;
667: case MATPRODUCT_AtB: transA = true; transB = false; break;
668: case MATPRODUCT_ABt: transA = false; transB = true; break;
669: default:
670: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
671: }
673: A = product->A;
674: B = product->B;
675: MatSeqAIJKokkosSyncDevice(A);
676: MatSeqAIJKokkosSyncDevice(B);
677: akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
678: bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
679: ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
683: csrmatA = &akok->csrmat;
684: csrmatB = &bkok->csrmat;
686: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
687: if (transA) {
688: MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);
689: transA = false;
690: }
692: if (transB) {
693: MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);
694: transB = false;
695: }
696: PetscLogGpuTimeBegin();
697: KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat);
698: KokkosKernels::sort_crs_matrix(ckok->csrmat); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
699: PetscLogGpuTimeEnd();
700: MatSeqAIJKokkosModifyDevice(C);
701: /* shorter version of MatAssemblyEnd_SeqAIJ */
702: c = (Mat_SeqAIJ*)C->data;
703: PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);
704: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
705: PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);
706: c->reallocs = 0;
707: C->info.mallocs = 0;
708: C->info.nz_unneeded = 0;
709: C->assembled = C->was_assembled = PETSC_TRUE;
710: C->num_ass++;
711: return 0;
712: }
714: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
715: {
716: Mat_Product *product = C->product;
717: MatProductType ptype;
718: Mat A,B;
719: bool transA,transB;
720: Mat_SeqAIJKokkos *akok,*bkok,*ckok;
721: MatProductData_SeqAIJKokkos *pdata;
722: MPI_Comm comm;
723: KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC;
725: MatCheckProduct(C,1);
726: PetscObjectGetComm((PetscObject)C,&comm);
728: A = product->A;
729: B = product->B;
730: MatSeqAIJKokkosSyncDevice(A);
731: MatSeqAIJKokkosSyncDevice(B);
732: akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
733: bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
734: csrmatA = &akok->csrmat;
735: csrmatB = &bkok->csrmat;
737: ptype = product->type;
738: switch (ptype) {
739: case MATPRODUCT_AB: transA = false; transB = false; break;
740: case MATPRODUCT_AtB: transA = true; transB = false; break;
741: case MATPRODUCT_ABt: transA = false; transB = true; break;
742: default:
743: SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
744: }
746: product->data = pdata = new MatProductData_SeqAIJKokkos();
747: pdata->kh.set_team_work_size(16);
748: pdata->kh.set_dynamic_scheduling(true);
749: pdata->reusesym = product->api_user;
751: /* TODO: add command line options to select spgemm algorithms */
752: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
753: /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
754: We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
755: */
756: pdata->kh.create_spgemm_handle(spgemm_alg);
758: PetscLogGpuTimeBegin();
759: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
760: if (transA) {
761: MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);
762: transA = false;
763: }
765: if (transB) {
766: MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);
767: transB = false;
768: }
770: KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC);
771: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
772: So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
773: calling new Mat_SeqAIJKokkos().
774: TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
775: */
776: KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC);
777: KokkosKernels::sort_crs_matrix(csrmatC);
778: PetscLogGpuTimeEnd();
780: ckok = new Mat_SeqAIJKokkos(csrmatC);
781: MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);
782: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
783: return 0;
784: }
786: /* handles sparse matrix matrix ops */
787: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
788: {
789: Mat_Product *product = mat->product;
790: PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
792: MatCheckProduct(mat,1);
793: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);
794: if (product->type == MATPRODUCT_ABC) {
795: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);
796: }
797: if (Biskok && Ciskok) {
798: switch (product->type) {
799: case MATPRODUCT_AB:
800: case MATPRODUCT_AtB:
801: case MATPRODUCT_ABt:
802: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
803: break;
804: case MATPRODUCT_PtAP:
805: case MATPRODUCT_RARt:
806: case MATPRODUCT_ABC:
807: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
808: break;
809: default:
810: break;
811: }
812: } else { /* fallback for AIJ */
813: MatProductSetFromOptions_SeqAIJ(mat);
814: }
815: return 0;
816: }
818: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
819: {
820: Mat_SeqAIJKokkos *aijkok;
822: PetscLogGpuTimeBegin();
823: MatSeqAIJKokkosSyncDevice(A);
824: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
825: KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
826: MatSeqAIJKokkosModifyDevice(A);
827: PetscLogGpuFlops(aijkok->a_dual.extent(0));
828: PetscLogGpuTimeEnd();
829: return 0;
830: }
832: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
833: {
834: Mat_SeqAIJKokkos *aijkok;
836: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
837: if (aijkok) { /* Only zero the device if data is already there */
838: KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
839: MatSeqAIJKokkosModifyDevice(A);
840: } else { /* Might be preallocated but not assembled */
841: MatZeroEntries_SeqAIJ(A);
842: }
843: return 0;
844: }
846: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
847: PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
848: {
849: Mat_SeqAIJKokkos *aijkok;
854: MatSeqAIJKokkosSyncDevice(A);
855: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
856: *kv = aijkok->a_dual.view_device();
857: return 0;
858: }
860: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
861: {
865: return 0;
866: }
868: PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
869: {
870: Mat_SeqAIJKokkos *aijkok;
875: MatSeqAIJKokkosSyncDevice(A);
876: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
877: *kv = aijkok->a_dual.view_device();
878: return 0;
879: }
881: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
882: {
886: MatSeqAIJKokkosModifyDevice(A);
887: return 0;
888: }
890: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
891: {
892: Mat_SeqAIJKokkos *aijkok;
897: aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
898: *kv = aijkok->a_dual.view_device();
899: return 0;
900: }
902: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
903: {
907: MatSeqAIJKokkosModifyDevice(A);
908: return 0;
909: }
911: /* Computes Y += alpha X */
912: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
913: {
914: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
915: Mat_SeqAIJKokkos *xkok,*ykok,*zkok;
916: ConstMatScalarKokkosView Xa;
917: MatScalarKokkosView Ya;
921: MatSeqAIJKokkosSyncDevice(Y);
922: MatSeqAIJKokkosSyncDevice(X);
923: PetscLogGpuTimeBegin();
925: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
926: /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
927: PetscBool e;
928: PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);
929: if (e) {
930: PetscArraycmp(x->j,y->j,y->nz,&e);
931: if (e) pattern = SAME_NONZERO_PATTERN;
932: }
933: }
935: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
936: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
937: If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
938: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
939: */
940: ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
941: xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
942: Xa = xkok->a_dual.view_device();
943: Ya = ykok->a_dual.view_device();
945: if (pattern == SAME_NONZERO_PATTERN) {
946: KokkosBlas::axpy(alpha,Xa,Ya);
947: MatSeqAIJKokkosModifyDevice(Y);
948: } else if (pattern == SUBSET_NONZERO_PATTERN) {
949: MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
950: MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
952: Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
953: PetscInt i = t.league_rank(); /* row i */
954: Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
955: PetscInt p,q = Yi(i);
956: for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
957: while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
958: if (Xj(p) == Yj(q)) { /* Found it */
959: Ya(q) += alpha * Xa(p);
960: q++;
961: } else {
962: /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
963: Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
964: */
965: if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
966: }
967: }
968: });
969: });
970: MatSeqAIJKokkosModifyDevice(Y);
971: } else { /* different nonzero patterns */
972: Mat Z;
973: KokkosCsrMatrix zcsr;
974: KernelHandle kh;
975: kh.create_spadd_handle(false);
976: KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
977: KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
978: zkok = new Mat_SeqAIJKokkos(zcsr);
979: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);
980: MatHeaderReplace(Y,&Z);
981: kh.destroy_spadd_handle();
982: }
983: PetscLogGpuTimeEnd();
984: PetscLogGpuFlops(xkok->a_dual.extent(0)*2); /* Because we scaled X and then added it to Y */
985: return 0;
986: }
988: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
989: {
990: Mat_SeqAIJKokkos *akok;
991: Mat_SeqAIJ *aseq;
993: MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);
994: aseq = static_cast<Mat_SeqAIJ*>(mat->data);
995: akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
996: delete akok;
997: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
998: MatZeroEntries_SeqAIJKokkos(mat);
999: akok->SetUpCOO(aseq);
1000: return 0;
1001: }
1003: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1004: {
1005: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1006: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1007: PetscCount Annz = aseq->nz;
1008: const PetscCountKokkosView& jmap = akok->jmap_d;
1009: const PetscCountKokkosView& perm = akok->perm_d;
1010: MatScalarKokkosView Aa;
1011: ConstMatScalarKokkosView kv;
1012: PetscMemType memtype;
1014: PetscGetMemType(v,&memtype);
1015: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1016: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1017: } else {
1018: kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1019: }
1021: if (imode == INSERT_VALUES) {
1022: MatSeqAIJGetKokkosViewWrite(A,&Aa); /* write matrix values */
1023: Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1024: } else MatSeqAIJGetKokkosView(A,&Aa); /* read & write matrix values */
1026: Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1028: if (imode == INSERT_VALUES) MatSeqAIJRestoreKokkosViewWrite(A,&Aa);
1029: else MatSeqAIJRestoreKokkosView(A,&Aa);
1030: return 0;
1031: }
1033: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1034: {
1035: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1036: MatScalarKokkosView Aa;
1037: const MatRowMapKokkosView& Ai = akok->i_dual.view_device();
1038: PetscInt m = A->rmap->n;
1039: ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */
1041: MatSeqAIJGetKokkosViewWrite(A,&Aa);
1042: Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1043: PetscScalar tmp;
1044: if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1045: tmp = Aa(Ai(i));
1046: Aa(Ai(i)) = Aa(Adiag(i));
1047: Aa(Adiag(i)) = tmp;
1048: }
1049: });
1050: MatSeqAIJRestoreKokkosViewWrite(A,&Aa);
1051: return 0;
1052: }
1054: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1055: {
1056: MatSeqAIJKokkosSyncHost(A);
1057: MatLUFactorNumeric_SeqAIJ(B,A,info);
1058: B->offloadmask = PETSC_OFFLOAD_CPU;
1059: return 0;
1060: }
1062: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1063: {
1064: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1066: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1067: A->boundtocpu = PETSC_FALSE;
1069: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1070: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1071: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1072: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1073: A->ops->scale = MatScale_SeqAIJKokkos;
1074: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1075: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1076: A->ops->mult = MatMult_SeqAIJKokkos;
1077: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1078: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1079: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1080: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1081: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1082: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1083: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1084: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1085: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1086: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1087: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1088: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1089: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1090: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1091: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1093: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);
1094: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);
1095: return 0;
1096: }
1098: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1099: {
1100: Mat_SeqAIJ *aseq;
1101: PetscInt i,m,n;
1105: m = akok->nrows();
1106: n = akok->ncols();
1107: MatSetSizes(A,m,n,m,n);
1108: MatSetType(A,MATSEQAIJKOKKOS);
1110: /* Set up data structures of A as a MATSEQAIJ */
1111: MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);
1112: aseq = (Mat_SeqAIJ*)(A)->data;
1114: akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1115: akok->j_dual.sync_host();
1117: aseq->i = akok->i_host_data();
1118: aseq->j = akok->j_host_data();
1119: aseq->a = akok->a_host_data();
1120: aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1121: aseq->singlemalloc = PETSC_FALSE;
1122: aseq->free_a = PETSC_FALSE;
1123: aseq->free_ij = PETSC_FALSE;
1124: aseq->nz = akok->nnz();
1125: aseq->maxnz = aseq->nz;
1127: PetscMalloc1(m,&aseq->imax);
1128: PetscMalloc1(m,&aseq->ilen);
1129: for (i=0; i<m; i++) {
1130: aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1131: }
1133: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1134: akok->nonzerostate = A->nonzerostate;
1135: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1136: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1137: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1138: return 0;
1139: }
1141: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1143: Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1144: */
1145: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1146: {
1147: MatCreate(comm,A);
1148: MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);
1149: return 0;
1150: }
1152: /* --------------------------------------------------------------------------------*/
1153: /*@C
1154: MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1155: (the default parallel PETSc format). This matrix will ultimately be handled by
1156: Kokkos for calculations. For good matrix
1157: assembly performance the user should preallocate the matrix storage by setting
1158: the parameter nz (or the array nnz). By setting these parameters accurately,
1159: performance during matrix assembly can be increased by more than a factor of 50.
1161: Collective
1163: Input Parameters:
1164: + comm - MPI communicator, set to PETSC_COMM_SELF
1165: . m - number of rows
1166: . n - number of columns
1167: . nz - number of nonzeros per row (same for all rows)
1168: - nnz - array containing the number of nonzeros in the various rows
1169: (possibly different for each row) or NULL
1171: Output Parameter:
1172: . A - the matrix
1174: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1175: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1176: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1178: Notes:
1179: If nnz is given then nz is ignored
1181: The AIJ format (also called the Yale sparse matrix format or
1182: compressed row storage), is fully compatible with standard Fortran 77
1183: storage. That is, the stored row and column indices can begin at
1184: either one (as in Fortran) or zero. See the users' manual for details.
1186: Specify the preallocated storage with either nz or nnz (not both).
1187: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1188: allocation. For large problems you MUST preallocate memory or you
1189: will get TERRIBLE performance, see the users' manual chapter on matrices.
1191: By default, this format uses inodes (identical nodes) when possible, to
1192: improve numerical efficiency of matrix-vector products and solves. We
1193: search for consecutive rows with the same nonzero structure, thereby
1194: reusing matrix information to achieve increased efficiency.
1196: Level: intermediate
1198: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1199: @*/
1200: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1201: {
1202: PetscKokkosInitializeCheck();
1203: MatCreate(comm,A);
1204: MatSetSizes(*A,m,n,m,n);
1205: MatSetType(*A,MATSEQAIJKOKKOS);
1206: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1207: return 0;
1208: }
1210: typedef Kokkos::TeamPolicy<>::member_type team_member;
1211: //
1212: // This factorization exploits block diagonal matrices with "Nf" (not used).
1213: // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1214: //
1215: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1216: {
1217: Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data;
1218: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1219: IS isrow = b->row,isicol = b->icol;
1220: const PetscInt *r_h,*ic_h;
1221: const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1222: const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1223: PetscScalar *ba_d = baijkok->a_dual.view_device().data();
1224: PetscBool row_identity,col_identity;
1225: PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1228: MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);
1230: ISGetIndices(isrow,&r_h);
1231: ISGetIndices(isicol,&ic_h);
1232: ISGetSize(isicol,&nc);
1233: PetscLogGpuTimeBegin();
1234: MatSeqAIJKokkosSyncDevice(A);
1235: {
1236: #define KOKKOS_SHARED_LEVEL 1
1237: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1238: using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1239: using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1240: const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1241: Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1242: const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1243: Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1244: size_t flops_h = 0.0;
1245: Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1246: Kokkos::View<size_t> d_flops_k ("flops");
1247: const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1248: const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1249: Kokkos::deep_copy (d_flops_k, h_flops_k);
1250: Kokkos::deep_copy (d_r_k, h_r_k);
1251: Kokkos::deep_copy (d_ic_k, h_ic_k);
1252: // Fill A --> fact
1253: Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1254: const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1255: const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1256: const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1257: // zero rows of B
1258: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1259: PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1260: PetscScalar *baL = ba_d + bi_d[rowb];
1261: PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1262: /* zero (unfactored row) */
1263: for (int j=0;j<nzbL;j++) baL[j] = 0;
1264: for (int j=0;j<nzbU;j++) baU[j] = 0;
1265: });
1266: // copy A into B
1267: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1268: PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1269: const PetscScalar *av = aa_d + ai_d[rowa];
1270: const PetscInt *ajtmp = aj_d + ai_d[rowa];
1271: /* load in initial (unfactored row) */
1272: for (int j=0;j<nza;j++) {
1273: PetscInt colb = ic[ajtmp[j]];
1274: PetscScalar vala = av[j];
1275: if (colb == rowb) {
1276: *(ba_d + bdiag_d[rowb]) = vala;
1277: } else {
1278: const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1279: PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1280: PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1281: for (int j=0; j<nz ; j++) {
1282: if (pbj[j] == colb) {
1283: pba[j] = vala;
1284: set++;
1285: break;
1286: }
1287: }
1288: #if !defined(PETSC_HAVE_SYCL)
1289: if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1290: #endif
1291: }
1292: }
1293: });
1294: });
1295: Kokkos::fence();
1297: Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1298: sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1299: scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1300: sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1301: const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1302: const PetscInt start = field*nloc, end = start + nloc;
1303: Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1304: // A22 panel update for each row A(1,:) and col A(:,1)
1305: for (int ii=start; ii<end-1; ii++) {
1306: const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1307: const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end)
1308: const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni);
1309: const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1310: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1311: PetscInt kIdx = j*Ni + field_block_idx;
1312: if (kIdx >= nzUi) /* void */ ;
1313: else {
1314: const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1315: const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1316: const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1317: size_t st_idx;
1318: // find and do L(k,i) = A(:k,i) / A(i,i)
1319: Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1320: // get column, there has got to be a better way
1321: Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1322: if (pjL[j] == ii) {
1323: PetscScalar *pLki = ba_d + bi_d[myk] + j;
1324: idx = j; // output
1325: *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i)
1326: }
1327: }, st_idx);
1328: Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1329: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1330: if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1331: #endif
1332: // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i
1333: // U(i+1,:end)
1334: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1335: PetscScalar Uij = baUi[uiIdx];
1336: PetscInt col = bjUi[uiIdx];
1337: if (col==myk) {
1338: // A_kk = A_kk - L_ki * U_ij(k)
1339: PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1340: *Akkv = *Akkv - L_ki() * Uij; // UiK
1341: } else {
1342: PetscScalar *start, *end, *pAkjv=NULL;
1343: PetscInt high, low;
1344: const PetscInt *startj;
1345: if (col<myk) { // L
1346: PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1347: PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1348: start = pLki+1; // start at pLki+1, A22(myk,1)
1349: startj= bj_d + bi_d[myk] + idx;
1350: end = ba_d + bi_d[myk+1];
1351: } else {
1352: PetscInt idx = bdiag_d[myk+1]+1;
1353: start = ba_d + idx;
1354: startj= bj_d + idx;
1355: end = ba_d + bdiag_d[myk];
1356: }
1357: // search for 'col', use bisection search - TODO
1358: low = 0;
1359: high = (PetscInt)(end-start);
1360: while (high-low > 5) {
1361: int t = (low+high)/2;
1362: if (startj[t] > col) high = t;
1363: else low = t;
1364: }
1365: for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1366: if (startj[pAkjv-start] == col) break;
1367: }
1368: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1369: if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1370: #endif
1371: *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1372: }
1373: });
1374: }
1375: });
1376: team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1377: if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1378: } /* endof for (i=0; i<n; i++) { */
1379: Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1380: });
1381: Kokkos::fence();
1382: Kokkos::deep_copy (h_flops_k, d_flops_k);
1383: PetscLogGpuFlops((PetscLogDouble)h_flops_k());
1384: Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1385: const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1386: const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1387: /* Invert diagonal for simpler triangular solves */
1388: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1389: int i = start + outer_index*Ni + lg_rank%Ni;
1390: if (i < end) {
1391: PetscScalar *pv = ba_d + bdiag_d[i];
1392: *pv = 1.0/(*pv);
1393: }
1394: });
1395: });
1396: }
1397: PetscLogGpuTimeEnd();
1398: ISRestoreIndices(isicol,&ic_h);
1399: ISRestoreIndices(isrow,&r_h);
1401: ISIdentity(isrow,&row_identity);
1402: ISIdentity(isicol,&col_identity);
1403: if (b->inode.size) {
1404: B->ops->solve = MatSolve_SeqAIJ_Inode;
1405: } else if (row_identity && col_identity) {
1406: B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1407: } else {
1408: B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1409: }
1410: B->offloadmask = PETSC_OFFLOAD_GPU;
1411: MatSeqAIJKokkosSyncHost(B); // solve on CPU
1412: B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this
1413: B->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1414: B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1415: B->ops->matsolve = MatMatSolve_SeqAIJ;
1416: B->assembled = PETSC_TRUE;
1417: B->preallocated = PETSC_TRUE;
1419: return 0;
1420: }
1422: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1423: {
1424: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
1425: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1426: return 0;
1427: }
1429: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1430: {
1431: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1433: if (!factors->sptrsv_symbolic_completed) {
1434: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1435: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1436: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1437: }
1438: return 0;
1439: }
1441: /* Check if we need to update factors etc for transpose solve */
1442: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1443: {
1444: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1445: MatColIdxType n = A->rmap->n;
1447: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1448: /* Update L^T and do sptrsv symbolic */
1449: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1450: Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1451: factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1452: factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1454: KokkosKernels::Impl::transpose_matrix<
1455: ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1456: MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1457: MatRowMapKokkosView,DefaultExecutionSpace>(
1458: n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1459: factors->iLt_d,factors->jLt_d,factors->aLt_d);
1461: /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1462: We have to sort the indices, until KK provides finer control options.
1463: */
1464: KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1465: MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1466: factors->iLt_d,factors->jLt_d,factors->aLt_d);
1468: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1470: /* Update U^T and do sptrsv symbolic */
1471: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1472: Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1473: factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1474: factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1476: KokkosKernels::Impl::transpose_matrix<
1477: ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1478: MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1479: MatRowMapKokkosView,DefaultExecutionSpace>(
1480: n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1481: factors->iUt_d,factors->jUt_d,factors->aUt_d);
1483: /* Sort indices. See comments above */
1484: KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1485: MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1486: factors->iUt_d,factors->jUt_d,factors->aUt_d);
1488: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1489: factors->transpose_updated = PETSC_TRUE;
1490: }
1491: return 0;
1492: }
1494: /* Solve Ax = b, with A = LU */
1495: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1496: {
1497: ConstPetscScalarKokkosView bv;
1498: PetscScalarKokkosView xv;
1499: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1501: PetscLogGpuTimeBegin();
1502: MatSeqAIJKokkosSymbolicSolveCheck(A);
1503: VecGetKokkosView(b,&bv);
1504: VecGetKokkosViewWrite(x,&xv);
1505: /* Solve L tmpv = b */
1506: KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
1507: /* Solve Ux = tmpv */
1508: KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
1509: VecRestoreKokkosView(b,&bv);
1510: VecRestoreKokkosViewWrite(x,&xv);
1511: PetscLogGpuTimeEnd();
1512: return 0;
1513: }
1515: /* Solve A^T x = b, where A^T = U^T L^T */
1516: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1517: {
1518: ConstPetscScalarKokkosView bv;
1519: PetscScalarKokkosView xv;
1520: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1522: PetscLogGpuTimeBegin();
1523: MatSeqAIJKokkosTransposeSolveCheck(A);
1524: VecGetKokkosView(b,&bv);
1525: VecGetKokkosViewWrite(x,&xv);
1526: /* Solve U^T tmpv = b */
1527: KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1529: /* Solve L^T x = tmpv */
1530: KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1531: VecRestoreKokkosView(b,&bv);
1532: VecRestoreKokkosViewWrite(x,&xv);
1533: PetscLogGpuTimeEnd();
1534: return 0;
1535: }
1537: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1538: {
1539: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1540: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1541: PetscInt fill_lev = info->levels;
1543: PetscLogGpuTimeBegin();
1544: MatSeqAIJKokkosSyncDevice(A);
1546: auto a_d = aijkok->a_dual.view_device();
1547: auto i_d = aijkok->i_dual.view_device();
1548: auto j_d = aijkok->j_dual.view_device();
1550: KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1552: B->assembled = PETSC_TRUE;
1553: B->preallocated = PETSC_TRUE;
1554: B->ops->solve = MatSolve_SeqAIJKokkos;
1555: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1556: B->ops->matsolve = NULL;
1557: B->ops->matsolvetranspose = NULL;
1558: B->offloadmask = PETSC_OFFLOAD_GPU;
1560: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1561: factors->transpose_updated = PETSC_FALSE;
1562: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1563: /* TODO: log flops, but how to know that? */
1564: PetscLogGpuTimeEnd();
1565: return 0;
1566: }
1568: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1569: {
1570: Mat_SeqAIJKokkos *aijkok;
1571: Mat_SeqAIJ *b;
1572: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1573: PetscInt fill_lev = info->levels;
1574: PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1575: PetscInt n = A->rmap->n;
1577: MatSeqAIJKokkosSyncDevice(A);
1578: /* Rebuild factors */
1579: if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1580: else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1582: /* Create a spiluk handle and then do symbolic factorization */
1583: nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1584: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1586: auto spiluk_handle = factors->kh.get_spiluk_handle();
1588: Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1589: Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1590: Kokkos::realloc(factors->iU_d,n+1);
1591: Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1593: aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1594: auto i_d = aijkok->i_dual.view_device();
1595: auto j_d = aijkok->j_dual.view_device();
1596: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1597: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1599: Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1600: Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1601: Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1602: Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1604: /* TODO: add options to select sptrsv algorithms */
1605: /* Create sptrsv handles for L, U and their transpose */
1606: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1607: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1608: #else
1609: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1610: #endif
1612: factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1613: factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1614: factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1615: factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1617: /* Fill fields of the factor matrix B */
1618: MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
1619: b = (Mat_SeqAIJ*)B->data;
1620: b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1621: B->info.fill_ratio_given = info->fill;
1622: B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1624: B->offloadmask = PETSC_OFFLOAD_GPU;
1625: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1626: return 0;
1627: }
1629: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1630: {
1631: Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data;
1632: const PetscInt nrows = A->rmap->n;
1634: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
1635: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1636: // move B data into Kokkos
1637: MatSeqAIJKokkosSyncDevice(B); // create aijkok
1638: MatSeqAIJKokkosSyncDevice(A); // create aijkok
1639: {
1640: Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1641: if (!baijkok->diag_d.extent(0)) {
1642: const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1643: baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1644: Kokkos::deep_copy (baijkok->diag_d, h_diag);
1645: }
1646: }
1647: return 0;
1648: }
1650: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1651: {
1652: *type = MATSOLVERKOKKOS;
1653: return 0;
1654: }
1656: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1657: {
1658: *type = MATSOLVERKOKKOSDEVICE;
1659: return 0;
1660: }
1662: /*MC
1663: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1664: on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1666: Level: beginner
1668: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1669: M*/
1670: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1671: {
1672: PetscInt n = A->rmap->n;
1674: MatCreate(PetscObjectComm((PetscObject)A),B);
1675: MatSetSizes(*B,n,n,n,n);
1676: (*B)->factortype = ftype;
1677: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
1678: MatSetType(*B,MATSEQAIJKOKKOS);
1680: if (ftype == MAT_FACTOR_LU) {
1681: MatSetBlockSizesFromMats(*B,A,A);
1682: (*B)->canuseordering = PETSC_TRUE;
1683: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1684: } else if (ftype == MAT_FACTOR_ILU) {
1685: MatSetBlockSizesFromMats(*B,A,A);
1686: (*B)->canuseordering = PETSC_FALSE;
1687: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1688: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1690: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
1691: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);
1692: return 0;
1693: }
1695: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1696: {
1697: PetscInt n = A->rmap->n;
1699: MatCreate(PetscObjectComm((PetscObject)A),B);
1700: MatSetSizes(*B,n,n,n,n);
1701: (*B)->factortype = ftype;
1702: (*B)->canuseordering = PETSC_TRUE;
1703: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
1704: MatSetType(*B,MATSEQAIJKOKKOS);
1706: if (ftype == MAT_FACTOR_LU) {
1707: MatSetBlockSizesFromMats(*B,A,A);
1708: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1709: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1711: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
1712: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);
1713: return 0;
1714: }
1716: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1717: {
1718: MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);
1719: MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);
1720: MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);
1721: return 0;
1722: }
1724: /* Utility to print out a KokkosCsrMatrix for debugging */
1725: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1726: {
1727: const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1728: const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1729: const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1730: const PetscInt *i = iv.data();
1731: const PetscInt *j = jv.data();
1732: const PetscScalar *a = av.data();
1733: PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1735: PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);
1736: for (PetscInt k=0; k<m; k++) {
1737: PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);
1738: for (PetscInt p=i[k]; p<i[k+1]; p++) {
1739: PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);
1740: }
1741: PetscPrintf(PETSC_COMM_SELF,"\n");
1742: }
1743: return 0;
1744: }