Actual source code: aijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscpkg_version.h>
  3: #include <petsc/private/petscimpl.h>
  4: #include <petsc/private/sfimpl.h>
  5: #include <petscsystypes.h>
  6: #include <petscerror.h>

  8: #include <Kokkos_Core.hpp>
  9: #include <KokkosBlas.hpp>
 10: #include <KokkosKernels_Sorting.hpp>
 11: #include <KokkosSparse_CrsMatrix.hpp>
 12: #include <KokkosSparse_spmv.hpp>
 13: #include <KokkosSparse_spiluk.hpp>
 14: #include <KokkosSparse_sptrsv.hpp>
 15: #include <KokkosSparse_spgemm.hpp>
 16: #include <KokkosSparse_spadd.hpp>

 18: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 20: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 22: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
 23:    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
 24:    In the latter case, it is important to set a_dual's sync state correctly.
 25:  */
 26: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
 27: {
 28:   Mat_SeqAIJ       *aijseq;
 29:   Mat_SeqAIJKokkos *aijkok;

 31:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
 32:   MatAssemblyEnd_SeqAIJ(A,mode);

 34:   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
 35:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

 37:   /* If aijkok does not exist, we just copy i, j to device.
 38:      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
 39:      In both cases, we build a new aijkok structure.
 40:   */
 41:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 42:     delete aijkok;
 43:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
 44:     A->spptr = aijkok;
 45:   }

 47:   if (aijkok && aijkok->device_mat_d.data()) {
 48:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 49:   }
 50:   return 0;
 51: }

 53: /* Sync CSR data to device if not yet */
 54: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 55: {
 56:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

 60:   if (aijkok->a_dual.need_sync_device()) {
 61:     aijkok->a_dual.sync_device();
 62:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
 63:     aijkok->hermitian_updated = PETSC_FALSE;
 64:   }
 65:   return 0;
 66: }

 68: /* Mark the CSR data on device as modified */
 69: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 70: {
 71:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

 74:   aijkok->a_dual.clear_sync_state();
 75:   aijkok->a_dual.modify_device();
 76:   aijkok->transpose_updated = PETSC_FALSE;
 77:   aijkok->hermitian_updated = PETSC_FALSE;
 78:   MatSeqAIJInvalidateDiagonal(A);
 79:   PetscObjectStateIncrease((PetscObject)A);
 80:   return 0;
 81: }

 83: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
 84: {
 85:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

 88:   /* We do not expect one needs factors on host  */
 91:   aijkok->a_dual.sync_host();
 92:   return 0;
 93: }

 95: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
 96: {
 97:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

 99:   if (aijkok) {
100:     aijkok->a_dual.sync_host();
101:     *array = aijkok->a_dual.view_host().data();
102:   } else { /* Happens when calling MatSetValues on a newly created matrix */
103:     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
104:   }
105:   return 0;
106: }

108: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
109: {
110:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

112:   if (aijkok) aijkok->a_dual.modify_host();
113:   return 0;
114: }

116: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
117: {
118:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

120:   if (aijkok) {
121:     aijkok->a_dual.sync_host();
122:     *array = aijkok->a_dual.view_host().data();
123:   } else {
124:     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
125:   }
126:   return 0;
127: }

129: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
130: {
131:   *array = NULL;
132:   return 0;
133: }

135: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
136: {
137:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

139:   if (aijkok) {
140:     *array = aijkok->a_dual.view_host().data();
141:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
142:     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
143:   }
144:   return 0;
145: }

147: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148: {
149:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

151:   if (aijkok) {
152:     aijkok->a_dual.clear_sync_state();
153:     aijkok->a_dual.modify_host();
154:   }
155:   return 0;
156: }

158: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
159: {
160:   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);


164:   if (i) *i = aijkok->i_device_data();
165:   if (j) *j = aijkok->j_device_data();
166:   if (a) {
167:     aijkok->a_dual.sync_device();
168:     *a = aijkok->a_device_data();
169:   }
170:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
171:   return 0;
172: }

174: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
175: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
176: {
177:   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
178:   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);

181:   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
182:   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
183:   return 0;
184: }

186: // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
187: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
188: {
189:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

191:   if (aijkok && aijkok->device_mat_d.data()) {
192:     *d_mat = aijkok->device_mat_d.data();
193:   } else {
194:     MatSeqAIJKokkosSyncDevice(A); // create aijkok (we are making d_mat now so make a place for it)
195:     *d_mat  = NULL;
196:   }
197:   return 0;
198: }

200: /* Generate the transpose on device and cache it internally */
201: static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202: {
203:   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

206:   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
207:     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
208:     aijkok->a_dual.sync_device();
209:     aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat);
210:     KokkosKernels::sort_crs_matrix(aijkok->csrmatT);
211:     aijkok->transpose_updated = PETSC_TRUE;
212:   }
213:   *csrmatT = &aijkok->csrmatT;
214:   return 0;
215: }

217: /* Generate the Hermitian on device and cache it internally */
218: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
219: {
220:   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

222:   PetscLogGpuTimeBegin();
224:   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
225:     aijkok->a_dual.sync_device();
226:     aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat);
227:     KokkosKernels::sort_crs_matrix(aijkok->csrmatH);
228:    #if defined(PETSC_USE_COMPLEX)
229:     const auto& a = aijkok->csrmatH.values;
230:     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
231:    #endif
232:     aijkok->hermitian_updated = PETSC_TRUE;
233:   }
234:   *csrmatH = &aijkok->csrmatH;
235:   PetscLogGpuTimeEnd();
236:   return 0;
237: }

239: /* y = A x */
240: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
241: {
242:   Mat_SeqAIJKokkos                 *aijkok;
243:   ConstPetscScalarKokkosView       xv;
244:   PetscScalarKokkosView            yv;

246:   PetscLogGpuTimeBegin();
247:   MatSeqAIJKokkosSyncDevice(A);
248:   VecGetKokkosView(xx,&xv);
249:   VecGetKokkosViewWrite(yy,&yv);
250:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
251:   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
252:   VecRestoreKokkosView(xx,&xv);
253:   VecRestoreKokkosViewWrite(yy,&yv);
254:   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
255:   PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());
256:   PetscLogGpuTimeEnd();
257:   return 0;
258: }

260: /* y = A^T x */
261: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
262: {
263:   Mat_SeqAIJKokkos                 *aijkok;
264:   const char                       *mode;
265:   ConstPetscScalarKokkosView       xv;
266:   PetscScalarKokkosView            yv;
267:   KokkosCsrMatrix                  *csrmat;

269:   PetscLogGpuTimeBegin();
270:   MatSeqAIJKokkosSyncDevice(A);
271:   VecGetKokkosView(xx,&xv);
272:   VecGetKokkosViewWrite(yy,&yv);
273:   if (A->form_explicit_transpose) {
274:     MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);
275:     mode = "N";
276:   } else {
277:     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278:     csrmat = &aijkok->csrmat;
279:     mode = "T";
280:   }
281:   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
282:   VecRestoreKokkosView(xx,&xv);
283:   VecRestoreKokkosViewWrite(yy,&yv);
284:   PetscLogGpuFlops(2.0*csrmat->nnz());
285:   PetscLogGpuTimeEnd();
286:   return 0;
287: }

289: /* y = A^H x */
290: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
291: {
292:   Mat_SeqAIJKokkos                 *aijkok;
293:   const char                       *mode;
294:   ConstPetscScalarKokkosView       xv;
295:   PetscScalarKokkosView            yv;
296:   KokkosCsrMatrix                  *csrmat;

298:   PetscLogGpuTimeBegin();
299:   MatSeqAIJKokkosSyncDevice(A);
300:   VecGetKokkosView(xx,&xv);
301:   VecGetKokkosViewWrite(yy,&yv);
302:   if (A->form_explicit_transpose) {
303:     MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);
304:     mode = "N";
305:   } else {
306:     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
307:     csrmat = &aijkok->csrmat;
308:     mode = "C";
309:   }
310:   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
311:   VecRestoreKokkosView(xx,&xv);
312:   VecRestoreKokkosViewWrite(yy,&yv);
313:   PetscLogGpuFlops(2.0*csrmat->nnz());
314:   PetscLogGpuTimeEnd();
315:   return 0;
316: }

318: /* z = A x + y */
319: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
320: {
321:   Mat_SeqAIJKokkos                 *aijkok;
322:   ConstPetscScalarKokkosView       xv,yv;
323:   PetscScalarKokkosView            zv;

325:   PetscLogGpuTimeBegin();
326:   MatSeqAIJKokkosSyncDevice(A);
327:   VecGetKokkosView(xx,&xv);
328:   VecGetKokkosView(yy,&yv);
329:   VecGetKokkosViewWrite(zz,&zv);
330:   if (zz != yy) Kokkos::deep_copy(zv,yv);
331:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
332:   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
333:   VecRestoreKokkosView(xx,&xv);
334:   VecRestoreKokkosView(yy,&yv);
335:   VecRestoreKokkosViewWrite(zz,&zv);
336:   PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());
337:   PetscLogGpuTimeEnd();
338:   return 0;
339: }

341: /* z = A^T x + y */
342: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
343: {
344:   Mat_SeqAIJKokkos                 *aijkok;
345:   const char                       *mode;
346:   ConstPetscScalarKokkosView       xv,yv;
347:   PetscScalarKokkosView            zv;
348:   KokkosCsrMatrix                  *csrmat;

350:   PetscLogGpuTimeBegin();
351:   MatSeqAIJKokkosSyncDevice(A);
352:   VecGetKokkosView(xx,&xv);
353:   VecGetKokkosView(yy,&yv);
354:   VecGetKokkosViewWrite(zz,&zv);
355:   if (zz != yy) Kokkos::deep_copy(zv,yv);
356:   if (A->form_explicit_transpose) {
357:     MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);
358:     mode = "N";
359:   } else {
360:     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
361:     csrmat = &aijkok->csrmat;
362:     mode = "T";
363:   }
364:   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
365:   VecRestoreKokkosView(xx,&xv);
366:   VecRestoreKokkosView(yy,&yv);
367:   VecRestoreKokkosViewWrite(zz,&zv);
368:   PetscLogGpuFlops(2.0*csrmat->nnz());
369:   PetscLogGpuTimeEnd();
370:   return 0;
371: }

373: /* z = A^H x + y */
374: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
375: {
376:   Mat_SeqAIJKokkos                 *aijkok;
377:   const char                       *mode;
378:   ConstPetscScalarKokkosView       xv,yv;
379:   PetscScalarKokkosView            zv;
380:   KokkosCsrMatrix                  *csrmat;

382:   PetscLogGpuTimeBegin();
383:   MatSeqAIJKokkosSyncDevice(A);
384:   VecGetKokkosView(xx,&xv);
385:   VecGetKokkosView(yy,&yv);
386:   VecGetKokkosViewWrite(zz,&zv);
387:   if (zz != yy) Kokkos::deep_copy(zv,yv);
388:   if (A->form_explicit_transpose) {
389:     MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);
390:     mode = "N";
391:   } else {
392:     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
393:     csrmat = &aijkok->csrmat;
394:     mode = "C";
395:   }
396:   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
397:   VecRestoreKokkosView(xx,&xv);
398:   VecRestoreKokkosView(yy,&yv);
399:   VecRestoreKokkosViewWrite(zz,&zv);
400:   PetscLogGpuFlops(2.0*csrmat->nnz());
401:   PetscLogGpuTimeEnd();
402:   return 0;
403: }

405: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
406: {
407:   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

409:   switch (op) {
410:     case MAT_FORM_EXPLICIT_TRANSPOSE:
411:       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
412:       if (A->form_explicit_transpose && !flg && aijkok) aijkok->DestroyMatTranspose();
413:       A->form_explicit_transpose = flg;
414:       break;
415:     default:
416:       MatSetOption_SeqAIJ(A,op,flg);
417:       break;
418:   }
419:   return 0;
420: }

422: /* Depending on reuse, either build a new mat, or use the existing mat */
423: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
424: {
425:   Mat_SeqAIJ       *aseq;

427:   PetscKokkosInitializeCheck();
428:   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
429:     MatDuplicate(A,MAT_COPY_VALUES,newmat); /* the returned newmat is a SeqAIJKokkos */
430:   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
431:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN); /* newmat is already a SeqAIJKokkos */
432:   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
434:     PetscFree(A->defaultvectype);
435:     PetscStrallocpy(VECKOKKOS,&A->defaultvectype); /* Allocate and copy the string */
436:     PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);
437:     MatSetOps_SeqAIJKokkos(A);
438:     aseq = static_cast<Mat_SeqAIJ*>(A->data);
439:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
441:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
442:     }
443:   }
444:   return 0;
445: }

447: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
448:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
449:  */
450: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
451: {
452:   Mat_SeqAIJ            *bseq;
453:   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454:   Mat                   mat;

456:   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
457:   MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
458:   mat  = *B;
459:   if (A->assembled) {
460:     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
461:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
462:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
463:     /* Now copy values to B if needed */
464:     if (dupOption == MAT_COPY_VALUES) {
465:       if (akok->a_dual.need_sync_device()) {
466:         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
467:         bkok->a_dual.modify_host();
468:       } else { /* If device has the latest data, we only copy data on device */
469:         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
470:         bkok->a_dual.modify_device();
471:       }
472:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
473:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
474:       bkok->a_dual.modify_host();
475:     }
476:     mat->spptr = bkok;
477:   }

479:   PetscFree(mat->defaultvectype);
480:   PetscStrallocpy(VECKOKKOS,&mat->defaultvectype); /* Allocate and copy the string */
481:   PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);
482:   MatSetOps_SeqAIJKokkos(mat);
483:   return 0;
484: }

486: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
487: {
488:   Mat               At;
489:   KokkosCsrMatrix   *internT;
490:   Mat_SeqAIJKokkos  *atkok,*bkok;

492:   MatSeqAIJKokkosGenerateTranspose_Private(A,&internT); /* Generate a transpose internally */
493:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
494:     /* Deep copy internT, as we want to isolate the internal transpose */
495:     atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT));
496:     MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);
497:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
498:     else MatHeaderReplace(A,&At); /* Replace A with At inplace */
499:   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
500:     if ((*B)->assembled) {
501:       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
502:       Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values);
503:       MatSeqAIJKokkosModifyDevice(*B);
504:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
505:       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
506:       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
507:       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
508:       Kokkos::deep_copy(a_h,internT->values);
509:       Kokkos::deep_copy(j_h,internT->graph.entries);
510:     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
511:   }
512:   return 0;
513: }

515: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
516: {
517:   Mat_SeqAIJKokkos           *aijkok;

519:   if (A->factortype == MAT_FACTOR_NONE) {
520:     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
521:     delete aijkok;
522:   } else {
523:     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
524:   }
525:   A->spptr = NULL;
526:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
527:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
528:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
529:   MatDestroy_SeqAIJ(A);
530:   return 0;
531: }

533: /*MC
534:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

536:    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types

538:    Options Database Keys:
539: .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()

541:   Level: beginner

543: .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
544: M*/
545: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
546: {
547:   PetscKokkosInitializeCheck();
548:   MatCreate_SeqAIJ(A);
549:   MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);
550:   return 0;
551: }

553: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
554: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
555: {
556:   Mat_SeqAIJ                   *a,*b;
557:   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
558:   MatScalarKokkosView          aa,ba,ca;
559:   MatRowMapKokkosView          ai,bi,ci;
560:   MatColIdxKokkosView          aj,bj,cj;
561:   PetscInt                     m,n,nnz,aN;


571:   MatSeqAIJKokkosSyncDevice(A);
572:   MatSeqAIJKokkosSyncDevice(B);
573:   a    = static_cast<Mat_SeqAIJ*>(A->data);
574:   b    = static_cast<Mat_SeqAIJ*>(B->data);
575:   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
576:   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
577:   aa   = akok->a_dual.view_device();
578:   ai   = akok->i_dual.view_device();
579:   ba   = bkok->a_dual.view_device();
580:   bi   = bkok->i_dual.view_device();
581:   m    = A->rmap->n; /* M, N and nnz of C */
582:   n    = A->cmap->n + B->cmap->n;
583:   nnz  = a->nz + b->nz;
584:   aN   = A->cmap->n; /* N of A */
585:   if (reuse == MAT_INITIAL_MATRIX) {
586:     aj = akok->j_dual.view_device();
587:     bj = bkok->j_dual.view_device();
588:     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
589:     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
590:     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
591:     ca = ca_dual.view_device();
592:     ci = ci_dual.view_device();
593:     cj = cj_dual.view_device();

595:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
596:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
597:       PetscInt i = t.league_rank(); /* row i */
598:       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);

600:       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
601:         ci(i) = coffset;
602:         if (i == m-1) ci(m) = ai(m) + bi(m);
603:       });

605:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
606:         if (k < alen) {
607:           ca(coffset+k) = aa(ai(i)+k);
608:           cj(coffset+k) = aj(ai(i)+k);
609:         } else {
610:           ca(coffset+k) = ba(bi(i)+k-alen);
611:           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
612:         }
613:       });
614:     });
615:     ca_dual.modify_device();
616:     ci_dual.modify_device();
617:     cj_dual.modify_device();
618:     ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual);
619:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);
620:   } else if (reuse == MAT_REUSE_MATRIX) {
623:     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
624:     ca   = ckok->a_dual.view_device();
625:     ci   = ckok->i_dual.view_device();

627:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
628:       PetscInt i = t.league_rank(); /* row i */
629:       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
630:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
631:         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
632:         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
633:       });
634:     });
635:     MatSeqAIJKokkosModifyDevice(*C);
636:   }
637:   return 0;
638: }

640: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
641: {
642:   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
643:   return 0;
644: }

646: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
647: {
648:   Mat_Product                    *product = C->product;
649:   Mat                            A,B;
650:   bool                           transA,transB; /* use bool, since KK needs this type */
651:   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
652:   Mat_SeqAIJ                     *c;
653:   MatProductData_SeqAIJKokkos    *pdata;
654:   KokkosCsrMatrix                *csrmatA,*csrmatB;

656:   MatCheckProduct(C,1);
658:   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);

660:   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
661:     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
662:     return 0;
663:   }

665:   switch (product->type) {
666:     case MATPRODUCT_AB:  transA = false; transB = false; break;
667:     case MATPRODUCT_AtB: transA = true;  transB = false; break;
668:     case MATPRODUCT_ABt: transA = false; transB = true;  break;
669:     default:
670:       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
671:   }

673:   A     = product->A;
674:   B     = product->B;
675:   MatSeqAIJKokkosSyncDevice(A);
676:   MatSeqAIJKokkosSyncDevice(B);
677:   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
678:   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
679:   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);


683:   csrmatA = &akok->csrmat;
684:   csrmatB = &bkok->csrmat;

686:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
687:   if (transA) {
688:     MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);
689:     transA = false;
690:   }

692:   if (transB) {
693:     MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);
694:     transB = false;
695:   }
696:   PetscLogGpuTimeBegin();
697:   KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat);
698:   KokkosKernels::sort_crs_matrix(ckok->csrmat); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
699:   PetscLogGpuTimeEnd();
700:   MatSeqAIJKokkosModifyDevice(C);
701:   /* shorter version of MatAssemblyEnd_SeqAIJ */
702:   c = (Mat_SeqAIJ*)C->data;
703:   PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);
704:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
705:   PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);
706:   c->reallocs         = 0;
707:   C->info.mallocs     = 0;
708:   C->info.nz_unneeded = 0;
709:   C->assembled        = C->was_assembled = PETSC_TRUE;
710:   C->num_ass++;
711:   return 0;
712: }

714: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
715: {
716:   Mat_Product                    *product = C->product;
717:   MatProductType                 ptype;
718:   Mat                            A,B;
719:   bool                           transA,transB;
720:   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
721:   MatProductData_SeqAIJKokkos    *pdata;
722:   MPI_Comm                       comm;
723:   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;

725:   MatCheckProduct(C,1);
726:   PetscObjectGetComm((PetscObject)C,&comm);
728:   A       = product->A;
729:   B       = product->B;
730:   MatSeqAIJKokkosSyncDevice(A);
731:   MatSeqAIJKokkosSyncDevice(B);
732:   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
733:   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
734:   csrmatA = &akok->csrmat;
735:   csrmatB = &bkok->csrmat;

737:   ptype   = product->type;
738:   switch (ptype) {
739:     case MATPRODUCT_AB:  transA = false; transB = false; break;
740:     case MATPRODUCT_AtB: transA = true;  transB = false; break;
741:     case MATPRODUCT_ABt: transA = false; transB = true;  break;
742:     default:
743:       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
744:   }

746:   product->data = pdata = new MatProductData_SeqAIJKokkos();
747:   pdata->kh.set_team_work_size(16);
748:   pdata->kh.set_dynamic_scheduling(true);
749:   pdata->reusesym = product->api_user;

751:   /* TODO: add command line options to select spgemm algorithms */
752:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
753:   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
754:      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
755:    */
756:   pdata->kh.create_spgemm_handle(spgemm_alg);

758:   PetscLogGpuTimeBegin();
759:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
760:   if (transA) {
761:     MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);
762:     transA = false;
763:   }

765:   if (transB) {
766:     MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);
767:     transB = false;
768:   }

770:   KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC);
771:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
772:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
773:     calling new Mat_SeqAIJKokkos().
774:     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
775:   */
776:   KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC);
777:   KokkosKernels::sort_crs_matrix(csrmatC);
778:   PetscLogGpuTimeEnd();

780:   ckok = new Mat_SeqAIJKokkos(csrmatC);
781:   MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);
782:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
783:   return 0;
784: }

786: /* handles sparse matrix matrix ops */
787: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
788: {
789:   Mat_Product    *product = mat->product;
790:   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;

792:   MatCheckProduct(mat,1);
793:   PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);
794:   if (product->type == MATPRODUCT_ABC) {
795:     PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);
796:   }
797:   if (Biskok && Ciskok) {
798:     switch (product->type) {
799:     case MATPRODUCT_AB:
800:     case MATPRODUCT_AtB:
801:     case MATPRODUCT_ABt:
802:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
803:       break;
804:     case MATPRODUCT_PtAP:
805:     case MATPRODUCT_RARt:
806:     case MATPRODUCT_ABC:
807:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
808:       break;
809:     default:
810:       break;
811:     }
812:   } else { /* fallback for AIJ */
813:     MatProductSetFromOptions_SeqAIJ(mat);
814:   }
815:   return 0;
816: }

818: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
819: {
820:   Mat_SeqAIJKokkos *aijkok;

822:   PetscLogGpuTimeBegin();
823:   MatSeqAIJKokkosSyncDevice(A);
824:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
825:   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
826:   MatSeqAIJKokkosModifyDevice(A);
827:   PetscLogGpuFlops(aijkok->a_dual.extent(0));
828:   PetscLogGpuTimeEnd();
829:   return 0;
830: }

832: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
833: {
834:   Mat_SeqAIJKokkos *aijkok;

836:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
837:   if (aijkok) { /* Only zero the device if data is already there */
838:     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
839:     MatSeqAIJKokkosModifyDevice(A);
840:   } else { /* Might be preallocated but not assembled */
841:     MatZeroEntries_SeqAIJ(A);
842:   }
843:   return 0;
844: }

846: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
847: PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
848: {
849:   Mat_SeqAIJKokkos   *aijkok;

854:   MatSeqAIJKokkosSyncDevice(A);
855:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
856:   *kv    = aijkok->a_dual.view_device();
857:   return 0;
858: }

860: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
861: {
865:   return 0;
866: }

868: PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
869: {
870:   Mat_SeqAIJKokkos   *aijkok;

875:   MatSeqAIJKokkosSyncDevice(A);
876:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
877:   *kv    = aijkok->a_dual.view_device();
878:   return 0;
879: }

881: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
882: {
886:   MatSeqAIJKokkosModifyDevice(A);
887:   return 0;
888: }

890: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
891: {
892:   Mat_SeqAIJKokkos   *aijkok;

897:   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
898:   *kv    = aijkok->a_dual.view_device();
899:   return 0;
900: }

902: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
903: {
907:   MatSeqAIJKokkosModifyDevice(A);
908:   return 0;
909: }

911: /* Computes Y += alpha X */
912: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
913: {
914:   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
915:   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
916:   ConstMatScalarKokkosView   Xa;
917:   MatScalarKokkosView        Ya;

921:   MatSeqAIJKokkosSyncDevice(Y);
922:   MatSeqAIJKokkosSyncDevice(X);
923:   PetscLogGpuTimeBegin();

925:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
926:     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
927:     PetscBool e;
928:     PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);
929:     if (e) {
930:       PetscArraycmp(x->j,y->j,y->nz,&e);
931:       if (e) pattern = SAME_NONZERO_PATTERN;
932:     }
933:   }

935:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
936:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
937:     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
938:     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
939:   */
940:   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
941:   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
942:   Xa   = xkok->a_dual.view_device();
943:   Ya   = ykok->a_dual.view_device();

945:   if (pattern == SAME_NONZERO_PATTERN) {
946:     KokkosBlas::axpy(alpha,Xa,Ya);
947:     MatSeqAIJKokkosModifyDevice(Y);
948:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
949:     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
950:     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();

952:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
953:       PetscInt i = t.league_rank(); /* row i */
954:       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
955:         PetscInt p,q = Yi(i);
956:         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
957:           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
958:           if (Xj(p) == Yj(q)) { /* Found it */
959:             Ya(q) += alpha * Xa(p);
960:             q++;
961:           } else {
962:             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
963:                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
964:             */
965:             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
966:           }
967:         }
968:       });
969:     });
970:     MatSeqAIJKokkosModifyDevice(Y);
971:   } else { /* different nonzero patterns */
972:     Mat             Z;
973:     KokkosCsrMatrix zcsr;
974:     KernelHandle    kh;
975:     kh.create_spadd_handle(false);
976:     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
977:     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
978:     zkok = new Mat_SeqAIJKokkos(zcsr);
979:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);
980:     MatHeaderReplace(Y,&Z);
981:     kh.destroy_spadd_handle();
982:   }
983:   PetscLogGpuTimeEnd();
984:   PetscLogGpuFlops(xkok->a_dual.extent(0)*2); /* Because we scaled X and then added it to Y */
985:   return 0;
986: }

988: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
989: {
990:   Mat_SeqAIJKokkos *akok;
991:   Mat_SeqAIJ       *aseq;

993:   MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);
994:   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
995:   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
996:   delete akok;
997:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
998:   MatZeroEntries_SeqAIJKokkos(mat);
999:   akok->SetUpCOO(aseq);
1000:   return 0;
1001: }

1003: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1004: {
1005:   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1006:   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1007:   PetscCount                  Annz = aseq->nz;
1008:   const PetscCountKokkosView& jmap = akok->jmap_d;
1009:   const PetscCountKokkosView& perm = akok->perm_d;
1010:   MatScalarKokkosView         Aa;
1011:   ConstMatScalarKokkosView    kv;
1012:   PetscMemType                memtype;

1014:   PetscGetMemType(v,&memtype);
1015:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1016:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1017:   } else {
1018:     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1019:   }

1021:   if (imode == INSERT_VALUES) {
1022:     MatSeqAIJGetKokkosViewWrite(A,&Aa); /* write matrix values */
1023:     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1024:   } else MatSeqAIJGetKokkosView(A,&Aa); /* read & write matrix values */

1026:   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});

1028:   if (imode == INSERT_VALUES) MatSeqAIJRestoreKokkosViewWrite(A,&Aa);
1029:   else MatSeqAIJRestoreKokkosView(A,&Aa);
1030:   return 0;
1031: }

1033: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1034: {
1035:   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1036:   MatScalarKokkosView         Aa;
1037:   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1038:   PetscInt                    m = A->rmap->n;
1039:   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */

1041:   MatSeqAIJGetKokkosViewWrite(A,&Aa);
1042:   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1043:     PetscScalar tmp;
1044:     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1045:       tmp          = Aa(Ai(i));
1046:       Aa(Ai(i))    = Aa(Adiag(i));
1047:       Aa(Adiag(i)) = tmp;
1048:     }
1049:   });
1050:   MatSeqAIJRestoreKokkosViewWrite(A,&Aa);
1051:   return 0;
1052: }

1054: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1055: {
1056:   MatSeqAIJKokkosSyncHost(A);
1057:   MatLUFactorNumeric_SeqAIJ(B,A,info);
1058:   B->offloadmask = PETSC_OFFLOAD_CPU;
1059:   return 0;
1060: }

1062: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1063: {
1064:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

1066:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1067:   A->boundtocpu  = PETSC_FALSE;

1069:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1070:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1071:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1072:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1073:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1074:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1075:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1076:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1077:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1078:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1079:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1080:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1081:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1082:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1083:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1084:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1085:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1086:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1087:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1088:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1089:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1090:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1091:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1093:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);
1094:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);
1095:   return 0;
1096: }

1098: PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1099: {
1100:   Mat_SeqAIJ *aseq;
1101:   PetscInt    i,m,n;


1105:   m = akok->nrows();
1106:   n = akok->ncols();
1107:   MatSetSizes(A,m,n,m,n);
1108:   MatSetType(A,MATSEQAIJKOKKOS);

1110:   /* Set up data structures of A as a MATSEQAIJ */
1111:   MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);
1112:   aseq = (Mat_SeqAIJ*)(A)->data;

1114:   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1115:   akok->j_dual.sync_host();

1117:   aseq->i            = akok->i_host_data();
1118:   aseq->j            = akok->j_host_data();
1119:   aseq->a            = akok->a_host_data();
1120:   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1121:   aseq->singlemalloc = PETSC_FALSE;
1122:   aseq->free_a       = PETSC_FALSE;
1123:   aseq->free_ij      = PETSC_FALSE;
1124:   aseq->nz           = akok->nnz();
1125:   aseq->maxnz        = aseq->nz;

1127:   PetscMalloc1(m,&aseq->imax);
1128:   PetscMalloc1(m,&aseq->ilen);
1129:   for (i=0; i<m; i++) {
1130:     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1131:   }

1133:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1134:   akok->nonzerostate = A->nonzerostate;
1135:   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1136:   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1137:   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1138:   return 0;
1139: }

1141: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1143:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1144:  */
1145: PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1146: {
1147:   MatCreate(comm,A);
1148:   MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);
1149:   return 0;
1150: }

1152: /* --------------------------------------------------------------------------------*/
1153: /*@C
1154:    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1155:    (the default parallel PETSc format). This matrix will ultimately be handled by
1156:    Kokkos for calculations. For good matrix
1157:    assembly performance the user should preallocate the matrix storage by setting
1158:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1159:    performance during matrix assembly can be increased by more than a factor of 50.

1161:    Collective

1163:    Input Parameters:
1164: +  comm - MPI communicator, set to PETSC_COMM_SELF
1165: .  m - number of rows
1166: .  n - number of columns
1167: .  nz - number of nonzeros per row (same for all rows)
1168: -  nnz - array containing the number of nonzeros in the various rows
1169:          (possibly different for each row) or NULL

1171:    Output Parameter:
1172: .  A - the matrix

1174:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1175:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1176:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1178:    Notes:
1179:    If nnz is given then nz is ignored

1181:    The AIJ format (also called the Yale sparse matrix format or
1182:    compressed row storage), is fully compatible with standard Fortran 77
1183:    storage.  That is, the stored row and column indices can begin at
1184:    either one (as in Fortran) or zero.  See the users' manual for details.

1186:    Specify the preallocated storage with either nz or nnz (not both).
1187:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1188:    allocation.  For large problems you MUST preallocate memory or you
1189:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1191:    By default, this format uses inodes (identical nodes) when possible, to
1192:    improve numerical efficiency of matrix-vector products and solves. We
1193:    search for consecutive rows with the same nonzero structure, thereby
1194:    reusing matrix information to achieve increased efficiency.

1196:    Level: intermediate

1198: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1199: @*/
1200: PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1201: {
1202:   PetscKokkosInitializeCheck();
1203:   MatCreate(comm,A);
1204:   MatSetSizes(*A,m,n,m,n);
1205:   MatSetType(*A,MATSEQAIJKOKKOS);
1206:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1207:   return 0;
1208: }

1210: typedef Kokkos::TeamPolicy<>::member_type team_member;
1211: //
1212: // This factorization exploits block diagonal matrices with "Nf" (not used).
1213: // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1214: //
1215: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1216: {
1217:   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1218:   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1219:   IS                 isrow = b->row,isicol = b->icol;
1220:   const PetscInt     *r_h,*ic_h;
1221:   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1222:   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1223:   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1224:   PetscBool          row_identity,col_identity;
1225:   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used

1228:   MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);
1230:   ISGetIndices(isrow,&r_h);
1231:   ISGetIndices(isicol,&ic_h);
1232:   ISGetSize(isicol,&nc);
1233:   PetscLogGpuTimeBegin();
1234:   MatSeqAIJKokkosSyncDevice(A);
1235:   {
1236: #define KOKKOS_SHARED_LEVEL 1
1237:     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1238:     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1239:     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1240:     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1241:     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1242:     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1243:     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1244:     size_t flops_h = 0.0;
1245:     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1246:     Kokkos::View<size_t> d_flops_k ("flops");
1247:     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1248:     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1249:     Kokkos::deep_copy (d_flops_k, h_flops_k);
1250:     Kokkos::deep_copy (d_r_k, h_r_k);
1251:     Kokkos::deep_copy (d_ic_k, h_ic_k);
1252:     // Fill A --> fact
1253:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1254:         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1255:         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1256:         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1257:         // zero rows of B
1258:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1259:             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1260:             PetscScalar *baL = ba_d + bi_d[rowb];
1261:             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1262:             /* zero (unfactored row) */
1263:             for (int j=0;j<nzbL;j++) baL[j] = 0;
1264:             for (int j=0;j<nzbU;j++) baU[j] = 0;
1265:           });
1266:         // copy A into B
1267:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1268:             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1269:             const PetscScalar *av    = aa_d + ai_d[rowa];
1270:             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1271:             /* load in initial (unfactored row) */
1272:             for (int j=0;j<nza;j++) {
1273:               PetscInt    colb = ic[ajtmp[j]];
1274:               PetscScalar vala = av[j];
1275:               if (colb == rowb) {
1276:                 *(ba_d + bdiag_d[rowb]) = vala;
1277:               } else {
1278:                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1279:                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1280:                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1281:                 for (int j=0; j<nz ; j++) {
1282:                   if (pbj[j] == colb) {
1283:                     pba[j] = vala;
1284:                     set++;
1285:                     break;
1286:                   }
1287:                 }
1288:                #if !defined(PETSC_HAVE_SYCL)
1289:                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1290:                #endif
1291:               }
1292:             }
1293:           });
1294:       });
1295:     Kokkos::fence();

1297:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1298:         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1299:         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1300:         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1301:         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1302:         const PetscInt  start = field*nloc, end = start + nloc;
1303:         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1304:         // A22 panel update for each row A(1,:) and col A(:,1)
1305:         for (int ii=start; ii<end-1; ii++) {
1306:           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1307:           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1308:           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1309:           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1310:           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1311:               PetscInt kIdx = j*Ni + field_block_idx;
1312:               if (kIdx >= nzUi) /* void */ ;
1313:               else {
1314:                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1315:                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1316:                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1317:                 size_t         st_idx;
1318:                 // find and do L(k,i) = A(:k,i) / A(i,i)
1319:                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1320:                 // get column, there has got to be a better way
1321:                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1322:                     if (pjL[j] == ii) {
1323:                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1324:                       idx = j; // output
1325:                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1326:                     }
1327:                 }, st_idx);
1328:                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1329: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1330:                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1331: #endif
1332:                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1333:                 // U(i+1,:end)
1334:                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1335:                       PetscScalar Uij = baUi[uiIdx];
1336:                       PetscInt    col = bjUi[uiIdx];
1337:                       if (col==myk) {
1338:                         // A_kk = A_kk - L_ki * U_ij(k)
1339:                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1340:                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1341:                       } else {
1342:                         PetscScalar    *start, *end, *pAkjv=NULL;
1343:                         PetscInt       high, low;
1344:                         const PetscInt *startj;
1345:                         if (col<myk) { // L
1346:                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1347:                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1348:                           start = pLki+1; // start at pLki+1, A22(myk,1)
1349:                           startj= bj_d + bi_d[myk] + idx;
1350:                           end   = ba_d + bi_d[myk+1];
1351:                         } else {
1352:                           PetscInt idx = bdiag_d[myk+1]+1;
1353:                           start = ba_d + idx;
1354:                           startj= bj_d + idx;
1355:                           end   = ba_d + bdiag_d[myk];
1356:                         }
1357:                         // search for 'col', use bisection search - TODO
1358:                         low  = 0;
1359:                         high = (PetscInt)(end-start);
1360:                         while (high-low > 5) {
1361:                           int t = (low+high)/2;
1362:                           if (startj[t] > col) high = t;
1363:                           else                 low  = t;
1364:                         }
1365:                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1366:                           if (startj[pAkjv-start] == col) break;
1367:                         }
1368: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1369:                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1370: #endif
1371:                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1372:                       }
1373:                     });
1374:               }
1375:             });
1376:           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1377:           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1378:         } /* endof for (i=0; i<n; i++) { */
1379:         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1380:       });
1381:     Kokkos::fence();
1382:     Kokkos::deep_copy (h_flops_k, d_flops_k);
1383:     PetscLogGpuFlops((PetscLogDouble)h_flops_k());
1384:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1385:         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1386:         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1387:         /* Invert diagonal for simpler triangular solves */
1388:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1389:             int i = start + outer_index*Ni + lg_rank%Ni;
1390:             if (i < end) {
1391:               PetscScalar *pv = ba_d + bdiag_d[i];
1392:               *pv = 1.0/(*pv);
1393:             }
1394:           });
1395:       });
1396:   }
1397:   PetscLogGpuTimeEnd();
1398:   ISRestoreIndices(isicol,&ic_h);
1399:   ISRestoreIndices(isrow,&r_h);

1401:   ISIdentity(isrow,&row_identity);
1402:   ISIdentity(isicol,&col_identity);
1403:   if (b->inode.size) {
1404:     B->ops->solve = MatSolve_SeqAIJ_Inode;
1405:   } else if (row_identity && col_identity) {
1406:     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1407:   } else {
1408:     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1409:   }
1410:   B->offloadmask = PETSC_OFFLOAD_GPU;
1411:   MatSeqAIJKokkosSyncHost(B); // solve on CPU
1412:   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1413:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1414:   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1415:   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1416:   B->assembled              = PETSC_TRUE;
1417:   B->preallocated           = PETSC_TRUE;

1419:   return 0;
1420: }

1422: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1423: {
1424:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
1425:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1426:   return 0;
1427: }

1429: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1430: {
1431:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;

1433:   if (!factors->sptrsv_symbolic_completed) {
1434:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1435:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1436:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1437:   }
1438:   return 0;
1439: }

1441: /* Check if we need to update factors etc for transpose solve */
1442: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1443: {
1444:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1445:   MatColIdxType             n = A->rmap->n;

1447:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1448:     /* Update L^T and do sptrsv symbolic */
1449:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1450:     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1451:     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1452:     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));

1454:     KokkosKernels::Impl::transpose_matrix<
1455:       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1456:       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1457:       MatRowMapKokkosView,DefaultExecutionSpace>(
1458:         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1459:         factors->iLt_d,factors->jLt_d,factors->aLt_d);

1461:     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1462:       We have to sort the indices, until KK provides finer control options.
1463:     */
1464:     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1465:       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1466:         factors->iLt_d,factors->jLt_d,factors->aLt_d);

1468:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);

1470:     /* Update U^T and do sptrsv symbolic */
1471:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1472:     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1473:     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1474:     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));

1476:     KokkosKernels::Impl::transpose_matrix<
1477:       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1478:       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1479:       MatRowMapKokkosView,DefaultExecutionSpace>(
1480:         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1481:         factors->iUt_d,factors->jUt_d,factors->aUt_d);

1483:     /* Sort indices. See comments above */
1484:     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1485:       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1486:         factors->iUt_d,factors->jUt_d,factors->aUt_d);

1488:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1489:     factors->transpose_updated = PETSC_TRUE;
1490:   }
1491:   return 0;
1492: }

1494: /* Solve Ax = b, with A = LU */
1495: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1496: {
1497:   ConstPetscScalarKokkosView     bv;
1498:   PetscScalarKokkosView          xv;
1499:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;

1501:   PetscLogGpuTimeBegin();
1502:   MatSeqAIJKokkosSymbolicSolveCheck(A);
1503:   VecGetKokkosView(b,&bv);
1504:   VecGetKokkosViewWrite(x,&xv);
1505:   /* Solve L tmpv = b */
1506:   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
1507:   /* Solve Ux = tmpv */
1508:   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
1509:   VecRestoreKokkosView(b,&bv);
1510:   VecRestoreKokkosViewWrite(x,&xv);
1511:   PetscLogGpuTimeEnd();
1512:   return 0;
1513: }

1515: /* Solve A^T x = b, where A^T = U^T L^T */
1516: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1517: {
1518:   ConstPetscScalarKokkosView     bv;
1519:   PetscScalarKokkosView          xv;
1520:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;

1522:   PetscLogGpuTimeBegin();
1523:   MatSeqAIJKokkosTransposeSolveCheck(A);
1524:   VecGetKokkosView(b,&bv);
1525:   VecGetKokkosViewWrite(x,&xv);
1526:   /* Solve U^T tmpv = b */
1527:   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);

1529:   /* Solve L^T x = tmpv */
1530:   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1531:   VecRestoreKokkosView(b,&bv);
1532:   VecRestoreKokkosViewWrite(x,&xv);
1533:   PetscLogGpuTimeEnd();
1534:   return 0;
1535: }

1537: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1538: {
1539:   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1540:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1541:   PetscInt                       fill_lev = info->levels;

1543:   PetscLogGpuTimeBegin();
1544:   MatSeqAIJKokkosSyncDevice(A);

1546:   auto a_d = aijkok->a_dual.view_device();
1547:   auto i_d = aijkok->i_dual.view_device();
1548:   auto j_d = aijkok->j_dual.view_device();

1550:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);

1552:   B->assembled                       = PETSC_TRUE;
1553:   B->preallocated                    = PETSC_TRUE;
1554:   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1555:   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1556:   B->ops->matsolve                   = NULL;
1557:   B->ops->matsolvetranspose          = NULL;
1558:   B->offloadmask                     = PETSC_OFFLOAD_GPU;

1560:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1561:   factors->transpose_updated         = PETSC_FALSE;
1562:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1563:   /* TODO: log flops, but how to know that? */
1564:   PetscLogGpuTimeEnd();
1565:   return 0;
1566: }

1568: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1569: {
1570:   Mat_SeqAIJKokkos               *aijkok;
1571:   Mat_SeqAIJ                     *b;
1572:   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1573:   PetscInt                       fill_lev = info->levels;
1574:   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1575:   PetscInt                       n = A->rmap->n;

1577:   MatSeqAIJKokkosSyncDevice(A);
1578:   /* Rebuild factors */
1579:   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1580:   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}

1582:   /* Create a spiluk handle and then do symbolic factorization */
1583:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1584:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);

1586:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1588:   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1589:   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1590:   Kokkos::realloc(factors->iU_d,n+1);
1591:   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());

1593:   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1594:   auto i_d = aijkok->i_dual.view_device();
1595:   auto j_d = aijkok->j_dual.view_device();
1596:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1597:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1599:   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1600:   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1601:   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1602:   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());

1604:   /* TODO: add options to select sptrsv algorithms */
1605:   /* Create sptrsv handles for L, U and their transpose */
1606:  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1607:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1608:  #else
1609:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1610:  #endif

1612:   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1613:   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1614:   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1615:   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);

1617:   /* Fill fields of the factor matrix B */
1618:   MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
1619:   b     = (Mat_SeqAIJ*)B->data;
1620:   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1621:   B->info.fill_ratio_given  = info->fill;
1622:   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);

1624:   B->offloadmask            = PETSC_OFFLOAD_GPU;
1625:   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1626:   return 0;
1627: }

1629: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1630: {
1631:   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1632:   const PetscInt   nrows   = A->rmap->n;

1634:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
1635:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1636:   // move B data into Kokkos
1637:   MatSeqAIJKokkosSyncDevice(B); // create aijkok
1638:   MatSeqAIJKokkosSyncDevice(A); // create aijkok
1639:   {
1640:     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1641:     if (!baijkok->diag_d.extent(0)) {
1642:       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1643:       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1644:       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1645:     }
1646:   }
1647:   return 0;
1648: }

1650: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1651: {
1652:   *type = MATSOLVERKOKKOS;
1653:   return 0;
1654: }

1656: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1657: {
1658:   *type = MATSOLVERKOKKOSDEVICE;
1659:   return 0;
1660: }

1662: /*MC
1663:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1664:   on a single GPU of type, SeqAIJKokkos, AIJKokkos.

1666:   Level: beginner

1668: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1669: M*/
1670: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1671: {
1672:   PetscInt       n = A->rmap->n;

1674:   MatCreate(PetscObjectComm((PetscObject)A),B);
1675:   MatSetSizes(*B,n,n,n,n);
1676:   (*B)->factortype = ftype;
1677:   PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
1678:   MatSetType(*B,MATSEQAIJKOKKOS);

1680:   if (ftype == MAT_FACTOR_LU) {
1681:     MatSetBlockSizesFromMats(*B,A,A);
1682:     (*B)->canuseordering         = PETSC_TRUE;
1683:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1684:   } else if (ftype == MAT_FACTOR_ILU) {
1685:     MatSetBlockSizesFromMats(*B,A,A);
1686:     (*B)->canuseordering         = PETSC_FALSE;
1687:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1688:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1690:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
1691:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);
1692:   return 0;
1693: }

1695: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1696: {
1697:   PetscInt       n = A->rmap->n;

1699:   MatCreate(PetscObjectComm((PetscObject)A),B);
1700:   MatSetSizes(*B,n,n,n,n);
1701:   (*B)->factortype = ftype;
1702:   (*B)->canuseordering = PETSC_TRUE;
1703:   PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
1704:   MatSetType(*B,MATSEQAIJKOKKOS);

1706:   if (ftype == MAT_FACTOR_LU) {
1707:     MatSetBlockSizesFromMats(*B,A,A);
1708:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1709:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");

1711:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
1712:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);
1713:   return 0;
1714: }

1716: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1717: {
1718:   MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);
1719:   MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);
1720:   MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);
1721:   return 0;
1722: }

1724: /* Utility to print out a KokkosCsrMatrix for debugging */
1725: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1726: {
1727:   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1728:   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1729:   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1730:   const PetscInt    *i = iv.data();
1731:   const PetscInt    *j = jv.data();
1732:   const PetscScalar *a = av.data();
1733:   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();

1735:   PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);
1736:   for (PetscInt k=0; k<m; k++) {
1737:     PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);
1738:     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1739:       PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);
1740:     }
1741:     PetscPrintf(PETSC_COMM_SELF,"\n");
1742:   }
1743:   return 0;
1744: }