Actual source code: mpiaijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscsf.h>
  3: #include <petsc/private/sfimpl.h>
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
  6: #include <KokkosSparse_spadd.hpp>

  8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
  9: {
 10:   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
 11:   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;

 13:   MatAssemblyEnd_MPIAIJ(A,mode);
 14:   if (aijkok && aijkok->device_mat_d.data()) {
 15:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 16:   }

 18:   return 0;
 19: }

 21: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
 22: {
 23:   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;

 25:   PetscLayoutSetUp(mat->rmap);
 26:   PetscLayoutSetUp(mat->cmap);
 27: #if defined(PETSC_USE_DEBUG)
 28:   if (d_nnz) {
 29:     PetscInt i;
 30:     for (i=0; i<mat->rmap->n; i++) {
 32:     }
 33:   }
 34:   if (o_nnz) {
 35:     PetscInt i;
 36:     for (i=0; i<mat->rmap->n; i++) {
 38:     }
 39:   }
 40: #endif
 41: #if defined(PETSC_USE_CTABLE)
 42:   PetscTableDestroy(&mpiaij->colmap);
 43: #else
 44:   PetscFree(mpiaij->colmap);
 45: #endif
 46:   PetscFree(mpiaij->garray);
 47:   VecDestroy(&mpiaij->lvec);
 48:   VecScatterDestroy(&mpiaij->Mvctx);
 49:   /* Because the B will have been resized we simply destroy it and create a new one each time */
 50:   MatDestroy(&mpiaij->B);

 52:   if (!mpiaij->A) {
 53:     MatCreate(PETSC_COMM_SELF,&mpiaij->A);
 54:     MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);
 55:     PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);
 56:   }
 57:   if (!mpiaij->B) {
 58:     PetscMPIInt size;
 59:     MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);
 60:     MatCreate(PETSC_COMM_SELF,&mpiaij->B);
 61:     MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);
 62:     PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);
 63:   }
 64:   MatSetType(mpiaij->A,MATSEQAIJKOKKOS);
 65:   MatSetType(mpiaij->B,MATSEQAIJKOKKOS);
 66:   MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);
 67:   MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);
 68:   mat->preallocated = PETSC_TRUE;
 69:   return 0;
 70: }
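
A minimal caller-side sketch of the preallocation path above, assuming a recent PETSc with PetscCall(); the sizes and nnz estimates (3 diagonal, 2 off-diagonal per row) are made-up illustration values, and the final assembly flows into MatAssemblyEnd_MPIAIJKokkos above.

#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat      A;
  PetscInt i,rstart,rend,n = 4; /* 4 local rows/cols per rank (illustrative) */

  PetscCall(PetscInitialize(&argc,&argv,NULL,NULL));
  PetscCall(MatCreate(PETSC_COMM_WORLD,&A));
  PetscCall(MatSetSizes(A,n,n,PETSC_DECIDE,PETSC_DECIDE));
  PetscCall(MatSetType(A,MATAIJKOKKOS));                 /* becomes MATMPIAIJKOKKOS on more than one rank */
  PetscCall(MatMPIAIJSetPreallocation(A,3,NULL,2,NULL)); /* routed to the preallocation routine above */
  PetscCall(MatGetOwnershipRange(A,&rstart,&rend));
  for (i=rstart; i<rend; i++) PetscCall(MatSetValue(A,i,i,1.0,INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
  PetscCall(PetscFinalize());
  return 0;
}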

 72: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
 73: {
 74:   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
 75:   PetscInt       nt;

 77:   VecGetLocalSize(xx,&nt);
 79:   VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
 80:   (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);
 81:   VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
 82:   (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);
 83:   return 0;
 84: }

 86: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
 87: {
 88:   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
 89:   PetscInt       nt;

 91:   VecGetLocalSize(xx,&nt);
 93:   VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
 94:   (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);
 95:   VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
 96:   (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);
 97:   return 0;
 98: }

100: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
101: {
102:   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
103:   PetscInt       nt;

105:   VecGetLocalSize(xx,&nt);
107:   (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);
108:   (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);
109:   VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
110:   VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
111:   return 0;
112: }

114: /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
115:    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
116:    C still uses local column ids. Their corresponding global column ids are returned in glob.
117: */
118: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
119: {
120:   Mat            Ad,Ao;
121:   const PetscInt *cmap;

123:   MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);
124:   MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);
125:   if (glob) {
126:     PetscInt cst, i, dn, on, *gidx;
127:     MatGetLocalSize(Ad,NULL,&dn);
128:     MatGetLocalSize(Ao,NULL,&on);
129:     MatGetOwnershipRangeColumn(mat,&cst,NULL);
130:     PetscMalloc1(dn+on,&gidx);
131:     for (i=0; i<dn; i++) gidx[i]    = cst + i;
132:     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
133:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
134:   }
135:   return 0;
136: }
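
The routine above backs the public MatMPIAIJGetLocalMatMerge() interface, which the product code below also uses. A hedged sketch of how a caller might use it; ViewLocalMatMerge is a hypothetical helper, and A is assumed to be an assembled MATMPIAIJKOKKOS matrix created elsewhere.

#include <petscmat.h>

/* Hypothetical helper: view the merged local part of A and the IS mapping its local col ids to global ones */
static PetscErrorCode ViewLocalMatMerge(Mat A)
{
  Mat A_loc;
  IS  glob;

  PetscCall(MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc));
  PetscCall(MatView(A_loc,PETSC_VIEWER_STDOUT_SELF)); /* A_loc's columns are local ids */
  PetscCall(ISView(glob,PETSC_VIEWER_STDOUT_SELF));   /* glob[j] is the global id of local column j */
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&A_loc));
  return 0;
}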

138: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
139: struct MatMatStruct {
140:   MatRowMapKokkosView   Cdstart; /* Used to split sequential matrix into petsc's A, B format */
141:   PetscSF               sf; /* SF to send/recv matrix entries */
142:   MatScalarKokkosView   abuf; /* buf of mat values in send/recv */
143:   Mat                   C1,C2,B_local;
144:   KokkosCsrMatrix       C1_global,C2_global,C_global;
145:   KernelHandle          kh;
146:   MatMatStruct() {
147:     C1 = C2 = B_local = NULL;
148:     sf = NULL;
149:   }

151:   ~MatMatStruct() {
152:     MatDestroy(&C1);
153:     MatDestroy(&C2);
154:     MatDestroy(&B_local);
155:     PetscSFDestroy(&sf);
156:     kh.destroy_spadd_handle();
157:   }
158: };

160: struct MatMatStruct_AB : public MatMatStruct {
161:   MatColIdxKokkosView   rows;
162:   MatRowMapKokkosView   rowoffset;
163:   Mat                   B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */

165:   MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
166:   ~MatMatStruct_AB() {
167:     MatDestroy(&B_other);
168:     MatDestroy(&C_petsc);
169:   }
170: };

172: struct MatMatStruct_AtB : public MatMatStruct {
173:   MatRowMapKokkosView   srcrowoffset,dstrowoffset;
174: };

176: struct MatProductData_MPIAIJKokkos
177: {
178:   MatMatStruct_AB   *mmAB;
179:   MatMatStruct_AtB  *mmAtB;
180:   PetscBool         reusesym;

182:   MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
183:   ~MatProductData_MPIAIJKokkos() {
184:     delete mmAB;
185:     delete mmAtB;
186:   }
187: };

189: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
190: {
191:   delete static_cast<MatProductData_MPIAIJKokkos*>(data);
192:   return 0;
193: }

195: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix

197:    Input Parameters:
198: +  A       - the MATSEQAIJKOKKOS matrix
199: .  N       - new column size for the returned Kokkos matrix
200: -  l2g     - a map that maps old col ids to new col ids

202:    Output Parameters:
203: .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
204:  */
205: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
206: {
207:   KokkosCsrMatrix&         orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
208:   MatColIdxKokkosView      jg("jg",orig.nnz()); /* New j array for csrmat */

210:   Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));});
211:   csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg);
212:   return 0;
213: }
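
A self-contained Kokkos sketch (independent of PETSc) of the same local-to-global column remap done in the parallel_for above; the map values and array sizes are made up for illustration.

#include <Kokkos_Core.hpp>

int main(int argc,char **argv)
{
  Kokkos::initialize(argc,argv);
  {
    const int n = 5;
    Kokkos::View<int*> l2g("l2g",3),j("j",n),jg("jg",n);
    auto l2g_h = Kokkos::create_mirror_view(l2g);
    l2g_h(0) = 7; l2g_h(1) = 2; l2g_h(2) = 9;  /* local cols 0,1,2 map to global cols 7,2,9 */
    Kokkos::deep_copy(l2g,l2g_h);
    Kokkos::parallel_for(n,KOKKOS_LAMBDA(const int i) { j(i) = i % 3; });       /* some local col ids */
    Kokkos::parallel_for(n,KOKKOS_LAMBDA(const int i) { jg(i) = l2g(j(i)); });  /* the remap */
  }
  Kokkos::finalize();
  return 0;
}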

215: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
216:    It is similar to MatCreateMPIAIJWithSplitArrays.

218:   Input Parameters:
219: +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
220: .  A     - the diag matrix using local col ids
221: -  B     - the offdiag matrix using global col ids

223:   Output Parameters:
224: .  mat   - the updated MATMPIAIJKOKKOS matrix
225: */
226: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
227: {
228:   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
229:   PetscInt            m,n,M,N,Am,An,Bm,Bn;
230:   Mat_SeqAIJKokkos    *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);

232:   MatGetSize(mat,&M,&N);
233:   MatGetLocalSize(mat,&m,&n);
234:   MatGetLocalSize(A,&Am,&An);
235:   MatGetLocalSize(B,&Bm,&Bn);

241:   mpiaij->A = A;
242:   mpiaij->B = B;

244:   mat->preallocated     = PETSC_TRUE;
245:   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */

247:   MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);
248:   MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);
249:   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
250:     also gets mpiaij->B compacted, with its col ids and size reduced
251:   */
252:   MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);
253:   MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);
254:   MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);

256:   /* Update bkok with new local col ids (stored on host) and size */
257:   bkok->j_dual.modify_host();
258:   bkok->j_dual.sync_device();
259:   bkok->SetColSize(mpiaij->B->cmap->n);
260:   return 0;
261: }

263: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).

265:    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
266:    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
267:    If C's j-th row is connected to a root identified by PetscSFNode (k,i), we will bcast the i-th row of B on rank k
268:    to the j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL were used to set its graph).

270:    Collective on comm of ownerSF

272:    Input Parameters:
273: +   B       - the SEQAIJKOKKOS matrix, using local col ids
274: .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
275: .   N       - global col ids are in range of [0,N). N must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
276: .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
277: .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)

279:    Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
280: +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
281: .   abuf      - buffer for sending matrix values
282: .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
283:                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
284: .   rowoffset - for each row in rows[], rowoffset[] stores the offset in abuf[] at which that row's values are placed
285: -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using global col ids.
286: */
287: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
288:                                            PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
289:                                            MatRowMapKokkosView& rowoffset,Mat& C)
290: {
291:   Mat_SeqAIJKokkos             *bkok,*ckok;

293:   MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
294:   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);

296:   if (reuse == MAT_REUSE_MATRIX) {
297:     ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);

299:     const auto& Ba = bkok->a_dual.view_device();
300:     const auto& Bi = bkok->i_dual.view_device();
301:     const auto& Ca = ckok->a_dual.view_device();

303:     /* Copy Ba to abuf */
304:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
305:       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
306:       PetscInt r    = rows(i);
307:       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
308:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
309:         abuf(base+k) = Ba(Bi(r)+k);
310:       });
311:     });

313:     /* Send abuf to Ca through bcastSF and then mark C as updated on device */
314:     PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE); /* TODO: get memtype for abuf */
315:     PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);
316:     ckok->a_dual.modify_device();
317:   } else if (reuse == MAT_INITIAL_MATRIX) {
318:     MPI_Comm       comm;
319:     PetscMPIInt    tag;
320:     PetscInt       k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;

322:     PetscObjectGetComm((PetscObject)ownerSF,&comm);
323:     PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);
324:     Cm   = nleaves; /* row size of C */
325:     Cn   = N;  /* col size of C, which initially uses global ids, so we can safely set its col size as N */

327:     /* Get row lens (nz) of B's rows for later fast query */
328:     PetscInt       *Browlens;
329:     const PetscInt *tmp = bkok->i_host_data();
330:     PetscMalloc1(nroots,&Browlens);
331:     for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];

333:     /* By ownerSF, each proc gets lens of rows of C */
334:     MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
335:     Ci_h    = Ci.view_host().data();
336:     Ci_h[0] = 0;
337:     PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);
338:     PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);
339:     for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
340:     Cnnz    = Ci_h[Cm];
341:     Ci.modify_host();
342:     Ci.sync_device();

344:     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
345:     MatColIdxKokkosDualView  Cj("j",Cnnz);
346:     MatScalarKokkosDualView  Ca("a",Cnnz);

348:     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
349:     const PetscMPIInt *iranks,*ranks;
350:     const PetscInt    *ioffset,*irootloc,*roffset;
351:     PetscInt          i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
352:     MPI_Request       *reqs;

354:     PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc); /* irootloc[] contains indices of rows I need to send to each receiver */
355:     PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/); /* recv info */

357:     /* figure out offsets at the send buffer, to build the SF
358:       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
359:       rowptr[] - stores offsets for data of each row in abuf

361:       rdisp[]  - to receive sdisp[]
362:     */
363:     PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);
364:     MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
365:     rowptr = rowptr_h.data();

367:     sdisp[0] = 0;
368:     rowptr[0]  = 0;
369:     for (i=0; i<niranks; i++) { /* for each receiver */
370:       PetscInt len, nz = 0;
371:       for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
372:         len         = Browlens[irootloc[j]];
373:         rowptr[j+1] = rowptr[j] + len;
374:         nz         += len;
375:       }
376:       sdisp[i+1] = sdisp[i] + nz;
377:     }
378:     PetscCommGetNewTag(comm,&tag);
379:     for (i=0; i<nranks; i++)  MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);
380:     for (i=0; i<niranks; i++) MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);
381:     MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);

383:     PetscInt    nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
384:     PetscInt    nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
385:     PetscSFNode *iremote;
386:     PetscMalloc1(nleaves2,&iremote);
387:     for (i=0; i<nranks; i++) { /* for each sender */
388:       k = 0;
389:       for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
390:         iremote[j].rank  = ranks[i];
391:         iremote[j].index = rdisp[i] + k;
392:         k++;
393:       }
394:     }
395:     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
396:     PetscSFCreate(comm,&bcastSF);
397:     PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);

399:     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
400:       from local to global. Then use bcastSF to fill Ca, Cj.
401:     */
402:     ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
403:     MatColIdxKokkosView          rows("rows",ioffset[niranks]);
404:     Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */

406:     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */

408:     MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
409:     abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */

411:     const auto& Ba = bkok->a_dual.view_device();
412:     const auto& Bi = bkok->i_dual.view_device();
413:     const auto& Bj = bkok->j_dual.view_device();

415:     /* Copy Ba, Bj to abuf, jbuf, changing col ids from local to global */
416:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
417:       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
418:       PetscInt r    = rows(i);
419:       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
420:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
421:         abuf(base+k) = Ba(Bi(r)+k);
422:         jbuf(base+k) = l2g(Bj(Bi(r)+k));
423:       });
424:     });

426:     /* Send abuf & jbuf to fill Ca, Cj */
427:     PetscSFBcastBegin(bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);
428:     PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);
429:     PetscSFBcastEnd  (bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);
430:     PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);
431:     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
432:     Cj.sync_host();
433:     Ca.modify_device();

435:     /* Construct C with Ca, Ci, Cj */
436:     auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
437:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);
438:     PetscFree3(sdisp,rdisp,reqs);
439:     PetscFree(Browlens);
440:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
441:   return 0;
442: }
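
The bcastSF built above is a plain SF whose roots are slots of the send buffer and whose leaves are slots of the receive buffer. A reduced, hedged sketch of that buffer-to-buffer pattern with a made-up one-entry graph (every rank's single leaf reads root slot 0 of rank 0):

#include <petscsf.h>

int main(int argc,char **argv)
{
  PetscSF     sf;
  PetscSFNode *iremote;
  PetscScalar rootdata[1],leafdata[1];

  PetscCall(PetscInitialize(&argc,&argv,NULL,NULL));
  PetscCall(PetscMalloc1(1,&iremote));
  iremote[0].rank  = 0; /* my single leaf maps to root slot 0 on rank 0 */
  iremote[0].index = 0;
  PetscCall(PetscSFCreate(PETSC_COMM_WORLD,&sf));
  PetscCall(PetscSFSetGraph(sf,1,1,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  rootdata[0] = 42.0; /* only rank 0's root slot is actually read */
  PetscCall(PetscSFBcastBegin(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_SCALAR,rootdata,leafdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFinalize());
  return 0;
}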

444: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)

446:   It is the reverse of MatSeqAIJKokkosBcast in some sense.

448:   Think of each row of A as a leaf; the given ownerSF then specifies the roots of the leaves. Roots may connect to multiple leaves.
449:   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
450:   contain repeated col ids, which do not matter since they will be summed up by other routines. C's row size will be the nroots of ownerSF.

452:   Input Parameters:
453: +  A        - the SEQAIJKOKKOS matrix to be reduced
454: .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
455: .  local    - true if A uses local col ids; false if A is already in global col ids.
456: .  N        - if local, N is A's global col size
457: .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
458: -  ownerSF  - the SF specifies ownership (root) of rows in A

460:   Output Parameters:
461: +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
462: .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
463: .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
464: .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Rows i and i+1 in abuf[] may go to
465:                   unrelated places in Ca, so dstrowoffset is not in a CSR-like format, unlike srcrowoffset.
466: -  C            - the matrix made up of rows sent to me from other ranks, using global col ids

468:    TODO: we could even have MatSeqAIJKokkosReduceBegin/End to give callers an opportunity to overlap computation and communication when reuse = MAT_REUSE_MATRIX.
469:  */
470: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
471:                                             PetscSF& reduceSF,MatScalarKokkosView& abuf,
472:                                             MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
473:                                             KokkosCsrMatrix& C)
474: {
475:   PetscInt               i,r,Am,An,Annz,Cnnz,nrows;
476:   const PetscInt         *Ai;
477:   Mat_SeqAIJKokkos       *akok;

479:   MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
480:   MatGetSize(A,&Am,&An);
481:   Ai   = static_cast<Mat_SeqAIJ*>(A->data)->i;
482:   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
483:   Annz = Ai[Am];

485:   if (reuse == MAT_REUSE_MATRIX) {
486:     /* Send Aa to abuf */
487:     PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
488:     PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);

490:     /* Copy abuf to Ca */
491:     const MatScalarKokkosView& Ca = C.values;
492:     nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
493:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
494:       PetscInt i   = t.league_rank();
495:       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
496:       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
497:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
498:     });
499:   } else if (reuse == MAT_INITIAL_MATRIX) {
500:     MPI_Comm               comm;
501:     MPI_Request            *reqs;
502:     PetscMPIInt            tag;
503:     PetscInt               Cm;

505:     PetscObjectGetComm((PetscObject)ownerSF,&comm);
506:     PetscCommGetNewTag(comm,&tag);

508:     PetscInt niranks,nranks,nroots,nleaves;
509:     const PetscMPIInt *iranks,*ranks;
510:     const PetscInt *ioffset,*rows,*roffset;  /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
511:     PetscSFSetUp(ownerSF);
512:     PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows); /* recv info: iranks[] will send rows to me */
513:     PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/); /* send info */
514:     PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);
516:     Cm    = nroots;
517:     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */

519:     /* Tell owners how long each row I will send */
520:     PetscInt                *srowlens; /* send buf of row lens */
521:     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
522:     PetscInt                *rrowlens = rrowlens_h.data();

524:     PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);
525:     for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
526:     rrowlens[0] = 0;
527:     rrowlens++; /* shift the pointer to make the following expression more readable */
528:     for (i=0; i<niranks; i++) MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);
529:     for (i=0; i<nranks; i++) MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);
530:     MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);

532:     /* Owner builds Ci on host by histogramming rrowlens[] */
533:     MatRowMapKokkosViewHost Ci_h("i",Cm+1);
534:     Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
535:     MatRowMapType *Ci_ptr = Ci_h.data();

537:     for (i=0; i<nrows; i++) {
538:       r = rows[i]; /* local row id of i-th received row */
539:      #if defined(PETSC_USE_DEBUG)
541:      #endif
542:       Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
543:     }
544:     for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
545:     Cnnz = Ci_ptr[Cm];

547:     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
548:     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
549:     PetscInt                *dstrowoffset_hptr = dstrowoffset_h.data();
550:     PetscInt                *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */

552:     PetscCalloc1(Cm,&currowlens); /* Init with zero, to be added to */
553:     for (i=0; i<nrows; i++) { /* for each row I receive */
554:       r                    = rows[i]; /* row id in C */
555:       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
556:       currowlens[r]       += rrowlens[i]; /* accumulate to length of row r in C */
557:     }
558:     PetscFree(currowlens);

560:     rrowlens--;
561:     for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
562:     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
563:     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */

565:     /* Build the reduceSF, which performs buffer to buffer send/recv */
566:     PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
567:     PetscMalloc2(niranks,&sdisp,nranks,&rdisp);
568:     for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
569:     for (i=0; i<nranks; i++)  MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);
570:     for (i=0; i<niranks; i++) MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);
571:     MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);

573:     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
574:     PetscInt    nroots2 = Cnnz,nleaves2 = Annz;
575:     PetscSFNode *iremote;
576:     PetscMalloc1(nleaves2,&iremote); /* no free, since memory will be given to reduceSF */
577:     for (i=0; i<nranks; i++) {
578:       PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
579:       PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
580:       PetscInt nz       = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
581:       for (PetscInt k=0; k<nz; k++) {
582:         iremote[leafbase+k].rank  = ranks[i];
583:         iremote[leafbase+k].index = rootbase + k;
584:       }
585:     }
586:     PetscSFCreate(comm,&reduceSF);
587:     PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
588:     PetscFree2(sdisp,rdisp);

590:     /* Reduce Aa, Ajg to abuf and jbuf */

592:     /* If A uses local col ids, convert them to global ones before sending */
593:     MatColIdxKokkosView Ajg;
594:     if (local) {
595:       Ajg = MatColIdxKokkosView("j",Annz);
596:       const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
597:       Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
598:     } else {
599:       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
600:     }

602:     MatColIdxKokkosView   jbuf("jbuf",Cnnz);
603:     abuf = MatScalarKokkosView("abuf",Cnnz);
604:     PetscSFReduceBegin(reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);
605:     PetscSFReduceEnd  (reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);
606:     PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
607:     PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);

609:     /* Copy data from abuf, jbuf to Ca, Cj */
610:     MatRowMapKokkosView    Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
611:     MatColIdxKokkosView    Cj("j",Cnnz);
612:     MatScalarKokkosView    Ca("a",Cnnz);

614:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
615:       PetscInt i   = t.league_rank();
616:       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
617:       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
618:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
619:         Ca(dst+k) = abuf(src+k);
620:         Cj(dst+k) = jbuf(src+k);
621:       });
622:     });

624:     /* Build C with Ca, Ci, Cj */
625:     C    = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
626:     PetscFree2(srowlens,reqs);
627:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
628:   return 0;
629: }
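
A concrete, made-up illustration of the two offset arrays computed above: suppose a rank receives nrows = 3 rows with lengths rrowlens = {2, 3, 1}, so srcrowoffset = {0, 2, 5, 6}. If received rows 0 and 2 both target C's row r (with Ci[r] = 10) and row 1 targets row s (with Ci[s] = 20), then dstrowoffset = {10, 20, 12}: row 2 is appended after the 2 entries row 0 already contributed to row r, which is why dstrowoffset cannot be a CSR array over the receive buffer the way srcrowoffset is.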

631: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix

633:   Input Parameters:
634: +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
635: .  reuse    - indicate whether the matrix has called this function before
636: .  csrmat   - the KokkosCsrMatrix, of size m,N
637: -  Cdstart  - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the position of the first
638:               entry of the diag block of C in csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
639:               entry is 5, then Cdstart[i] = 3.

641:   Output Parameters:
642: +  C        - the updated MATMPIAIJKOKKOS matrix
643: -  Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter

645:   Notes:
646:    Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
647:  */
648: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
649: {
650:   const MatScalarKokkosView&      Ca = csrmat.values;
651:   const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
652:   PetscInt                        m,n,N;

654:   MatGetLocalSize(C,&m,&n);
655:   MatGetSize(C,NULL,&N);

657:   if (reuse == MAT_REUSE_MATRIX) {
658:     Mat_MPIAIJ                  *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
659:     Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
660:     Mat_SeqAIJKokkos            *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
661:     const MatScalarKokkosView&  Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
662:     const MatRowMapKokkosView&  Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();

664:     /* Fill 'a' of Cd and Co on device */
665:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
666:       PetscInt i       = t.league_rank(); /* row i */
667:       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
668:       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
669:       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
670:       PetscInt cdend   = cdstart + cdlen;
671:       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
672:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
673:         if (k < cdstart) {  /* k in [0, cdstart) */
674:           Coa(Coi(i)+k) = Ca(Ci(i)+k);
675:         } else if (k < cdend) { /* k in [cdstart, cdend) */
676:           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
677:         } else { /* k in [cdend, clen) */
678:           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
679:         }
680:       });
681:     });

683:     akok->a_dual.modify_device();
684:     bkok->a_dual.modify_device();
685:   } else if (reuse == MAT_INITIAL_MATRIX) {
686:     Mat                         Cd,Co;
687:     const MatColIdxKokkosView&  Cj = csrmat.graph.entries;
688:     MatRowMapKokkosDualView     Cdi_dual("i",m+1),Coi_dual("i",m+1);
689:     MatRowMapKokkosView         Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
690:     PetscInt                    cstart,cend;

692:     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
693:        left of the diag block, the diag block, and right of the diag block. The diag block has col ids in [cstart,cend).
694:        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
695:        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
696:        stores values of cdstart and cdend-cdstart (aka Cdi[]) instead.
697:      */
698:     Cdstart = MatRowMapKokkosView("Cdstart",m);
699:     PetscLayoutGetRange(C->cmap,&cstart,&cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */

701:     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
702:       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
703:      */
704:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
705:       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
706:         PetscInt i = t.league_rank(); /* row i */
707:         PetscInt j,first,count,step;

709:         if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
710:           Cdi(0) = 0;
711:           Coi(0) = 0;
712:         }

714:         /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
715:           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
716:         */
717:         count = Ci(i+1)-Ci(i);
718:         first = Ci(i);
719:         while (count > 0) {
720:           j    = first;
721:           step = count / 2;
722:           j   += step;
723:           if (Cj(j) < cstart) {
724:             first  = ++j;
725:             count -= step + 1;
726:           } else count = step;
727:         }
728:         Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */

730:         /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
731:         count = Ci(i+1) - first;
732:         while (count > 0) {
733:           j    = first;
734:           step = count / 2;
735:           j   += step;
736:           if (Cj(j) < cend) {
737:             first  = ++j;
738:             count -= step + 1;
739:           } else count = step;
740:         }
741:         Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
742:         Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
743:       });
744:     });

746:     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
747:     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
748:       update += Cdi(i);
749:       if (final) Cdi(i) = update;
750:     });
751:     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
752:       update += Coi(i);
753:       if (final) Coi(i) = update;
754:     });

756:     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
757:        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
758:     */
759:     Cdi_dual.modify_device();
760:     Coi_dual.modify_device();
761:     Cdi_dual.sync_host();
762:     Coi_dual.sync_host();
763:     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
764:     PetscInt Co_nnz = Coi_dual.view_host().data()[m];

766:     /* With nnz, allocate a, j for Cd and Co */
767:     MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
768:     MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);

770:     /* Fill a, j of Cd and Co on device */
771:     MatColIdxKokkosView     Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
772:     MatScalarKokkosView     Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();

774:     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
775:       PetscInt i       = t.league_rank(); /* row i */
776:       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
777:       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
778:       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
779:       PetscInt cdend   = cdstart + cdlen;
780:       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
781:       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
782:         if (k < cdstart) { /* k in [0, cdstart) */
783:           Coa(Coi(i)+k) = Ca(Ci(i)+k);
784:           Coj(Coi(i)+k) = Cj(Ci(i)+k);
785:         } else if (k < cdend) { /* k in [cdstart, cdend) */
786:           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
787:           Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
788:         } else { /* k in [cdend, clen) */
789:           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
790:           Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
791:         }
792:       });
793:     });

795:     Cdj_dual.modify_device();
796:     Cda_dual.modify_device();
797:     Coj_dual.modify_device();
798:     Coa_dual.modify_device();
799:     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each one's MatAssemblyEnd */
800:     auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
801:     auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
802:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);
803:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);
804:     MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co); /* Coj will be converted to local ids within */
805:   }
806:   return 0;
807: }
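
The two while-loops in the kernel above hand-roll std::lower_bound, since the C++ standard library is not usable in device code. A standalone host-side sketch of the same search, applied to the example row from the function's comment block (LowerBound is a made-up name for illustration):

#include <stdio.h>

/* Return the first index in [lo,hi) of the sorted array a[] whose value is >= key, or hi if none */
static int LowerBound(const int *a,int lo,int hi,int key)
{
  int count = hi - lo,first = lo;
  while (count > 0) {
    int step = count / 2,j = first + step;
    if (a[j] < key) { first = j + 1; count -= step + 1; }
    else count = step;
  }
  return first;
}

int main(void)
{
  const int cj[] = {0,3,4,5,7,9};           /* one row's global col ids, as in the comment above */
  int cdstart = LowerBound(cj,0,6,5);       /* first col id >= cstart=5 -> 3 */
  int cdend   = LowerBound(cj,cdstart,6,8); /* first col id >= cend=8  -> 5 */
  printf("diag block of this row is [%d,%d)\n",cdstart,cdend);
  return 0;
}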

809: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.

811:   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.

813:   Input Parameter:
814: .  C        - the SEQAIJKOKKOS matrix, using global col ids

820:   Output Parameters:
821: +  C        - the updated matrix, now using compacted (local) col ids and a reduced col size
822: -  l2g      - a Kokkos view mapping C's new local col ids to the old global ones

824:   Notes: the input matrix's col ids and col size will be changed.
825: */
826: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
827: {
828:   Mat_SeqAIJKokkos       *ckok;
829:   ISLocalToGlobalMapping l2gmap;
830:   const PetscInt         *garray;
831:   PetscInt               sz;

833:   /* Compact P_other's global col ids and col size. We do this since we guess KK might be more memory scalable with local col ids */
834:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);
835:   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
836:   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
837:   ckok->j_dual.sync_device();
838:   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */

840:   /* Build l2g -- the local to global mapping of C's cols */
841:   ISLocalToGlobalMappingGetIndices(l2gmap,&garray);
842:   ISLocalToGlobalMappingGetSize(l2gmap,&sz);

845:   ConstMatColIdxKokkosViewHost tmp(garray,sz);
846:   l2g = MatColIdxKokkosView("l2g",sz);
847:   Kokkos::deep_copy(l2g,tmp);

849:   ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);
850:   ISLocalToGlobalMappingDestroy(&l2gmap);
851:   return 0;
852: }
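
For example (made up), if the matrix passed in only references global columns {2, 7, 9}, compaction renumbers them to local columns {0, 1, 2}, shrinks the col size to 3, and returns l2g = {2, 7, 9} so that callers can still translate the compacted col ids back to global ones.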

854: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos

856:   Input Parameters:
857: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
858: .  A        - an MPIAIJKOKKOS matrix
859: .  B        - an MPIAIJKOKKOS matrix
860: -  mm       - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.

862:   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
863: */
864: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
865: {
866:   Mat_MPIAIJ                  *a = static_cast<Mat_MPIAIJ*>(A->data);
867:   Mat                         Ad = a->A,Ao = a->B; /* diag and offdiag of A */
868:   IS                          glob = NULL;
869:   const PetscInt              *garray;
870:   PetscInt                    N = B->cmap->N,sz;
871:   ConstMatColIdxKokkosView    l2g1; /* two temp maps mapping local col ids to global ones */
872:   MatColIdxKokkosView         l2g2;
873:   Mat                         C1,C2; /* intermediate matrices */

875:   /* C1 = Ad * B_local. B_local is a matrix obtained by merging Bd and Bo, and uses local col ids */
876:   MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);
877:   MatProductCreate(Ad,mm->B_local,NULL,&C1);
878:   MatProductSetType(C1,MATPRODUCT_AB);
879:   MatProductSetFill(C1,product->fill);
880:   C1->product->api_user = product->api_user;
881:   MatProductSetFromOptions(C1);
883:   (*C1->ops->productsymbolic)(C1);

885:   ISGetIndices(glob,&garray);
886:   ISGetSize(glob,&sz);
887:   const auto& tmp  = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
888:   l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
889:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global);

891:   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
892:   MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,mm->abuf,mm->rows,mm->rowoffset,mm->B_other);

894:   /* Compact B_other to use local ids, as we guess KK spgemm is more memory scalable with that; we could skip the compaction to simplify the code */
895:   MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);
896:   MatProductCreate(Ao,mm->B_other,NULL,&C2);
897:   MatProductSetType(C2,MATPRODUCT_AB);
898:   MatProductSetFill(C2,product->fill);
899:   C2->product->api_user = product->api_user;
900:   MatProductSetFromOptions(C2);
902:   (*C2->ops->productsymbolic)(C2);
903:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global);

905:   /* C = C1 + C2.  We actually add their global-col-id versions */
906:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
907:   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
908:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
909:   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);

911:   mm->C1 = C1;
912:   mm->C2 = C2;
913:   ISRestoreIndices(glob,&garray);
914:   ISDestroy(&glob);
915:   return 0;
916: }
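
In block terms, the routine above computes the locally owned rows of C = A*B as Ad*B_local + Ao*B_other: Ad and Ao are A's diag and offdiag blocks, B_local holds B's locally owned rows (merged, in local col ids), and B_other holds the off-process rows of B referenced by Ao's ghost columns; the two partial products are then added with KokkosSparse spadd in global col ids.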

918: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos

920:   Input Parameters:
921: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
922: .  A        - an MPIAIJKOKKOS matrix
923: .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
924: .  localB   - Does B use local col ids? If false, then B is already in global col ids.
925: .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
926: .  l2g      - If localB, then l2g maps B's local col ids to global ones.
927: -  mm       - a struct used to stash intermediate data in AtB

929:   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
930: */
931: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
932: {
933:   Mat_MPIAIJ             *a = static_cast<Mat_MPIAIJ*>(A->data);
934:   Mat                    Ad = a->A,Ao = a->B; /* diag and offdiag of A */
935:   Mat                    C1,C2; /* intermediate matrices */

937:   /* C1 = Ad^t * B */
938:   MatProductCreate(Ad,B,NULL,&C1);
939:   MatProductSetType(C1,MATPRODUCT_AtB);
940:   MatProductSetFill(C1,product->fill);
941:   C1->product->api_user = product->api_user;
942:   MatProductSetFromOptions(C1);
944:   (*C1->ops->productsymbolic)(C1);

946:   if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);
947:   else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */

949:   /* C2 = Ao^t * B */
950:   MatProductCreate(Ao,B,NULL,&C2);
951:   MatProductSetType(C2,MATPRODUCT_AtB);
952:   MatProductSetFill(C2,product->fill);
953:   C2->product->api_user = product->api_user;
954:   MatProductSetFromOptions(C2);
956:   (*C2->ops->productsymbolic)(C2);

958:   MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);

960:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may not be */
961:   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
962:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
963:   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
964:   mm->C1 = C1;
965:   mm->C2 = C2;
966:   return 0;
967: }
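
Correspondingly, for C = A^t*B the locally owned rows of C get two contributions: Ad^t*B, whose rows are already owned here, and Ao^t*B, whose rows belong to the ranks owning A's ghost columns and therefore must be reduced onto those owners (via MatSeqAIJKokkosReduce) before the spadd.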

969: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
970: {
971:   Mat_Product                   *product = C->product;
972:   MatProductType                ptype;
973:   MatProductData_MPIAIJKokkos   *mmdata;
974:   MatMatStruct                  *mm = NULL;
975:   MatMatStruct_AB               *ab;
976:   MatMatStruct_AtB              *atb;
977:   Mat                           A,B,Ad,Ao,Bd,Bo;
978:   const MatScalarType           one = 1.0; /* Do not use the literal 1.0 directly, to avoid a wrong template instantiation in KokkosSparse::spadd_numeric */

980:   MatCheckProduct(C,1);
981:   mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
982:   ptype  = product->type;
983:   A      = product->A;
984:   B      = product->B;
985:   Ad     = static_cast<Mat_MPIAIJ*>(A->data)->A;
986:   Ao     = static_cast<Mat_MPIAIJ*>(A->data)->B;
987:   Bd     = static_cast<Mat_MPIAIJ*>(B->data)->A;
988:   Bo     = static_cast<Mat_MPIAIJ*>(B->data)->B;

990:   if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
991:     mmdata->reusesym = PETSC_FALSE; /* So that next time when the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
992:     ab  = mmdata->mmAB;
993:     atb = mmdata->mmAtB;
994:     if (ab) {
995:       static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
996:       static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
997:     }
998:     if (atb) {
999:       static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1000:       static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1001:     }
1002:     return 0;
1003:   }

1005:   if (ptype == MATPRODUCT_AB) {
1006:     ab   = mmdata->mmAB;
1007:     /* C1 = Ad * B_local */
1009:     MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);
1011:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,ab->C1);
1012:     (*ab->C1->ops->productnumeric)(ab->C1);
1013:     PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1014:                                  ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1015:     /* C2 = Ao * B_other */
1017:     if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,ab->C2);
1018:     (*ab->C2->ops->productnumeric)(ab->C2);
1019:     /* C = C1_global + C2_global */
1020:     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1021:     mm = static_cast<MatMatStruct*>(ab);
1022:   } else if (ptype == MATPRODUCT_AtB) {
1023:     atb  = mmdata->mmAtB;
1025:     /* C1 = Ad^t * B_local */
1026:     MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);
1028:     if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,atb->C1);
1029:     (*atb->C1->ops->productnumeric)(atb->C1);

1031:     /* C2 = Ao^t * B_local */
1033:     if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,atb->C2);
1034:     (*atb->C2->ops->productnumeric)(atb->C2);
1035:     /* Form C2_global */
1036:     PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1037:                                   atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1038:     /* C = C1_global + C2_global */
1039:     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1040:     mm = static_cast<MatMatStruct*>(atb);
1041:   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1042:     ab   = mmdata->mmAB;
1043:     MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);

1045:     /* ab->C1 = Ad * B_local */
1047:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,ab->C1);
1048:     (*ab->C1->ops->productnumeric)(ab->C1);
1049:     PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1050:                                  ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1051:     /* ab->C2 = Ao * B_other */
1052:     if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,ab->C2);
1053:     (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1054:     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);

1056:     /* atb->C1 = Bd^t * ab->C_petsc */
1057:     atb  = mmdata->mmAtB;
1059:     if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd,NULL,NULL,atb->C1);
1060:     (*atb->C1->ops->productnumeric)(atb->C1);
1061:     /* atb->C2 = Bo^t * ab->C_petsc */
1062:     if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo,NULL,NULL,atb->C2);
1063:     (*atb->C2->ops->productnumeric)(atb->C2);
1064:     PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1065:                                   atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1066:     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1067:     mm = static_cast<MatMatStruct*>(atb);
1068:   }
1069:   /* Split C_global to form C */
1070:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);
1071:   return 0;
1072: }

1074: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1075: {
1076:   Mat                         A,B;
1077:   Mat_Product                 *product = C->product;
1078:   MatProductType              ptype;
1079:   MatProductData_MPIAIJKokkos *mmdata;
1080:   MatMatStruct                *mm = NULL;
1081:   IS                          glob = NULL;
1082:   const PetscInt              *garray;
1083:   PetscInt                    m,n,M,N,sz;
1084:   ConstMatColIdxKokkosView    l2g; /* map local col ids to global ones */

1086:   MatCheckProduct(C,1);
1088:   ptype = product->type;
1089:   A     = product->A;
1090:   B     = product->B;

1092:   switch (ptype) {
1093:     case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1094:     case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1095:     case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
1096:     default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1097:   }

1099:   MatSetSizes(C,m,n,M,N);
1100:   PetscLayoutSetUp(C->rmap);
1101:   PetscLayoutSetUp(C->cmap);
1102:   MatSetType(C,((PetscObject)A)->type_name);

1104:   mmdata           = new MatProductData_MPIAIJKokkos();
1105:   mmdata->reusesym = product->api_user;

1107:   if (ptype == MATPRODUCT_AB) {
1108:     mmdata->mmAB = new MatMatStruct_AB();
1109:     MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);
1110:     mm   = static_cast<MatMatStruct*>(mmdata->mmAB);
1111:   } else if (ptype == MATPRODUCT_AtB) {
1112:     mmdata->mmAtB = new MatMatStruct_AtB();
1113:     auto atb      = mmdata->mmAtB;
1114:     MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);
1115:     ISGetIndices(glob,&garray);
1116:     ISGetSize(glob,&sz);
1117:     l2g  = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
1118:     MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);
1119:     ISRestoreIndices(glob,&garray);
1120:     ISDestroy(&glob);
1121:     mm   = static_cast<MatMatStruct*>(atb);
1122:   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1123:     mmdata->mmAB  = new MatMatStruct_AB(); /* tmp=A*B */
1124:     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1125:     auto ab       = mmdata->mmAB;
1126:     auto atb      = mmdata->mmAtB;
1127:     MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);
1128:     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1129:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);
1130:     MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);
1131:     mm   = static_cast<MatMatStruct*>(atb);
1132:   }
1133:   /* Split the C_global into petsc A, B format */
1134:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);
1135:   C->product->data        = mmdata;
1136:   C->product->destroy     = MatProductDataDestroy_MPIAIJKokkos;
1137:   C->ops->productnumeric  = MatProductNumeric_MPIAIJKokkos;
1138:   return 0;
1139: }
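
/* Illustrative sketch (editor's addition, not part of the original source): the symbolic/numeric
   routines above are normally reached through the generic MatProduct interface rather than called
   directly. A minimal driver for C = A*B, assuming A and B are assembled MATMPIAIJKOKKOS matrices;
   the function name ExampleProductAB is hypothetical. */
static PetscErrorCode ExampleProductAB(Mat A,Mat B,Mat *C)
{
  MatProductCreate(A,B,NULL,C);        /* create the product container C */
  MatProductSetType(*C,MATPRODUCT_AB);
  MatProductSetFromOptions(*C);        /* selects MatProductSymbolic_MPIAIJKokkos unless a CPU fallback is requested */
  MatProductSymbolic(*C);              /* builds the symbolic data kept in C->product->data */
  MatProductNumeric(*C);               /* computes the values; may be called again when A or B change */
  return 0;
}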

1141: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1142: {
1144:   Mat_Product    *product = mat->product;
1145:   PetscBool      match = PETSC_FALSE;
1146:   PetscBool      usecpu = PETSC_FALSE;

1148:   MatCheckProduct(mat,1);
1149:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
1150:     PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);
1151:   }
1152:   if (match) { /* we can always fall back to the CPU if requested */
1153:     switch (product->type) {
1154:     case MATPRODUCT_AB:
1155:       if (product->api_user) {
1156:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
1157:         PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
1158:         PetscOptionsEnd();
1159:       } else {
1160:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
1161:         PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
1162:         PetscOptionsEnd();
1163:       }
1164:       break;
1165:     case MATPRODUCT_AtB:
1166:       if (product->api_user) {
1167:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
1168:         PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
1169:         PetscOptionsEnd();
1170:       } else {
1171:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
1172:         PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
1173:         PetscOptionsEnd();
1174:       }
1175:       break;
1176:     case MATPRODUCT_PtAP:
1177:       if (product->api_user) {
1178:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
1179:         PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
1180:         PetscOptionsEnd();
1181:       } else {
1182:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
1183:         PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
1184:         PetscOptionsEnd();
1185:       }
1186:       break;
1187:     default:
1188:       break;
1189:     }
1190:     match = (PetscBool)!usecpu;
1191:   }
1192:   if (match) {
1193:     switch (product->type) {
1194:     case MATPRODUCT_AB:
1195:     case MATPRODUCT_AtB:
1196:     case MATPRODUCT_PtAP:
1197:       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1198:       break;
1199:     default:
1200:       break;
1201:     }
1202:   }
1203:   /* fall back to MPIAIJ ops */
1204:   if (!mat->ops->productsymbolic) {
1205:     MatProductSetFromOptions_MPIAIJ(mat);
1206:   }
1207:   return 0;
1208: }
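
/* Illustrative sketch (editor's addition): from the user-level API, the routine above is what
   honours the *_backend_cpu options. With two assembled MATMPIAIJKOKKOS matrices, the call below
   uses the Kokkos product kernels by default and falls back to the MPIAIJ (CPU) path when run with
   -matmatmult_backend_cpu. The function name ExampleMatMatMult is hypothetical. */
static PetscErrorCode ExampleMatMatMult(Mat A,Mat B,Mat *C)
{
  MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,C); /* api_user path: consults -matmatmult_backend_cpu */
  return 0;
}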

1210: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1211: {
1212:   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)mat->data;
1213:   Mat_MPIAIJKokkos *mpikok;

1215:   MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);
1216:   mat->preallocated = PETSC_TRUE;
1217:   MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);
1218:   MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);
1219:   MatZeroEntries(mat);
1220:   mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1221:   delete mpikok;
1222:   mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1223:   return 0;
1224: }

1226: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode)
1227: {
1228:   Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
1229:   Mat_MPIAIJKokkos               *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1230:   Mat                            A = mpiaij->A,B = mpiaij->B;
1231:   PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
1232:   MatScalarKokkosView            Aa,Ba;
1233:   MatScalarKokkosView            v1;
1234:   MatScalarKokkosView&           vsend = mpikok->sendbuf_d;
1235:   const MatScalarKokkosView&     v2 = mpikok->recvbuf_d;
1236:   const PetscCountKokkosView&    Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d;
1237:   const PetscCountKokkosView&    Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d;
1238:   const PetscCountKokkosView&    Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d;
1239:   const PetscCountKokkosView&    Cperm1 = mpikok->Cperm1_d;
1240:   PetscMemType                   memtype;

1242:   PetscGetMemType(v,&memtype); /* Return PETSC_MEMTYPE_HOST when v is NULL */
1243:   if (PetscMemTypeHost(memtype)) { /* If the user passed v[] on the host, copy it to the device */
1244:     v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n));
1245:   } else {
1246:     v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */
1247:   }

1249:   if (imode == INSERT_VALUES) {
1250:     MatSeqAIJGetKokkosViewWrite(A,&Aa); /* write matrix values */
1251:     MatSeqAIJGetKokkosViewWrite(B,&Ba);
1252:     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1253:     Kokkos::deep_copy(Ba,0.0);
1254:   } else {
1255:     MatSeqAIJGetKokkosView(A,&Aa); /* read & write matrix values */
1256:     MatSeqAIJGetKokkosView(B,&Ba);
1257:   }

1259:   /* Pack entries to be sent to remote processes */
1260:   Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));});

1262:   /* Send remote entries to their owner and overlap the communication with local computation */
1263:   PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE);
1264:   /* Add local entries to A and B */
1265:   Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));});
1266:   Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));});
1267:   PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE);

1269:   /* Add received remote entries to A and B */
1270:   Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));});
1271:   Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));});

1273:   if (imode == INSERT_VALUES) {
1274:     MatSeqAIJRestoreKokkosViewWrite(A,&Aa); /* Increase A & B's state etc. */
1275:     MatSeqAIJRestoreKokkosViewWrite(B,&Ba);
1276:   } else {
1277:     MatSeqAIJRestoreKokkosView(A,&Aa);
1278:     MatSeqAIJRestoreKokkosView(B,&Ba);
1279:   }
1280:   return 0;
1281: }
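
/* Illustrative sketch (editor's addition): typical use of the two COO routines above. The (i,j)
   pattern is declared once with MatSetPreallocationCOO() and values, which may live in host or
   device memory, are then supplied with MatSetValuesCOO(). The 2x2 sample data and the function
   name ExampleCOOAssembly are hypothetical. */
static PetscErrorCode ExampleCOOAssembly(Mat A)
{
  PetscInt    coo_i[4] = {0,0,1,1},coo_j[4] = {0,1,0,1}; /* global row/column indices of the entries */
  PetscScalar v[4]     = {1.0,2.0,3.0,4.0};

  MatSetPreallocationCOO(A,4,coo_i,coo_j); /* builds the maps and the SF used by MatSetValuesCOO_MPIAIJKokkos() */
  MatSetValuesCOO(A,v,INSERT_VALUES);      /* repeated entries with the same (i,j) are summed */
  return 0;
}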

1283: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1284: {
1285:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;

1287:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
1288:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
1289:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",   NULL);
1290:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",          NULL);
1291:   delete (Mat_MPIAIJKokkos*)mpiaij->spptr;
1292:   MatDestroy_MPIAIJ(A);
1293:   return 0;
1294: }

1296: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1297: {
1298:   Mat                B;
1299:   Mat_MPIAIJ         *a;

1301:   if (reuse == MAT_INITIAL_MATRIX) {
1302:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
1303:   } else if (reuse == MAT_REUSE_MATRIX) {
1304:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
1305:   }
1306:   B = *newmat;

1308:   B->boundtocpu = PETSC_FALSE;
1309:   PetscFree(B->defaultvectype);
1310:   PetscStrallocpy(VECKOKKOS,&B->defaultvectype);
1311:   PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);

1313:   a = static_cast<Mat_MPIAIJ*>(A->data);
1314:   if (a->A) MatSetType(a->A,MATSEQAIJKOKKOS);
1315:   if (a->B) MatSetType(a->B,MATSEQAIJKOKKOS);
1316:   if (a->lvec) VecSetType(a->lvec,VECSEQKOKKOS);

1318:   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1319:   B->ops->mult                  = MatMult_MPIAIJKokkos;
1320:   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1321:   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1322:   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1323:   B->ops->destroy               = MatDestroy_MPIAIJKokkos;

1325:   PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);
1326:   PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1327:   PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C",   MatSetPreallocationCOO_MPIAIJKokkos);
1328:   PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C",          MatSetValuesCOO_MPIAIJKokkos);
1329:   return 0;
1330: }
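
/* Illustrative sketch (editor's addition): the conversion above is also reachable through the
   public MatConvert() interface, e.g. to move an already assembled MPIAIJ matrix to the Kokkos
   type. The function name ExampleConvertToKokkos is hypothetical. */
static PetscErrorCode ExampleConvertToKokkos(Mat A,Mat *Akok)
{
  MatConvert(A,MATMPIAIJKOKKOS,MAT_INITIAL_MATRIX,Akok); /* or MAT_INPLACE_MATRIX with &A */
  return 0;
}
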
1331: /*MC
1332:    MATMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

1334:    A matrix type using the Kokkos Kernels CrsMatrix type for portability across different device types

1336:    Options Database Keys:
1337: .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()

1339:   Level: beginner

1341: .seealso: MatCreateAIJKokkos(), MATSEQAIJKOKKOS
1342: M*/
1343: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1344: {
1345:   PetscKokkosInitializeCheck();
1346:   MatCreate_MPIAIJ(A);
1347:   MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);
1348:   return 0;
1349: }
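
/* Illustrative sketch (editor's addition): the aijkokkos type documented above is usually selected
   at runtime with -mat_type aijkokkos, but it can also be requested in code as sketched below; the
   function name ExampleCreateFromOptions is hypothetical. */
static PetscErrorCode ExampleCreateFromOptions(MPI_Comm comm,PetscInt m,PetscInt n,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,m,n,PETSC_DETERMINE,PETSC_DETERMINE);
  MatSetType(*A,MATAIJKOKKOS); /* the generic "aijkokkos" name picks the seq or mpi variant based on the communicator */
  MatSetFromOptions(*A);       /* -mat_type aijkokkos would have the same effect */
  MatSetUp(*A);
  return 0;
}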

1351: /*@C
1352:    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1353:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
1354:    to Kokkos for calculations. For good matrix
1355:    assembly performance the user should preallocate the matrix storage by setting
1356:    the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters accurately,
1357:    performance during matrix assembly can be increased by more than a factor of 50.

1359:    Collective

1361:    Input Parameters:
1362: +  comm - MPI communicator
1363: .  m,n - number of local rows and columns (or PETSC_DECIDE)
1364: .  M,N - number of global rows and columns (or PETSC_DETERMINE)
1365: .  d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal blocks (same for all local rows)
1366: -  d_nnz,o_nnz - arrays containing the number of nonzeros per row in the diagonal and off-diagonal blocks
1367:          (possibly different for each row) or NULL

1369:    Output Parameter:
1370: .  A - the matrix

1372:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1373:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1374:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1376:    Notes:
1377:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

1379:    The AIJ format (also called the Yale sparse matrix format or
1380:    compressed row storage) is fully compatible with standard Fortran 77
1381:    storage.  That is, the stored row and column indices can begin at
1382:    either one (as in Fortran) or zero.  See the users' manual for details.

1384:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1385:    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
1386:    allocation.  For large problems you MUST preallocate memory or you
1387:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1389:    By default, this format uses inodes (identical nodes) when possible, to
1390:    improve numerical efficiency of matrix-vector products and solves. We
1391:    search for consecutive rows with the same nonzero structure, thereby
1392:    reusing matrix information to achieve increased efficiency.

1394:    Level: intermediate

1396: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJKOKKOS, MATAIJKOKKOS
1397: @*/
1398: PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
1399: {
1400:   PetscMPIInt    size;

1402:   MatCreate(comm,A);
1403:   MatSetSizes(*A,m,n,M,N);
1404:   MPI_Comm_size(comm,&size);
1405:   if (size > 1) {
1406:     MatSetType(*A,MATMPIAIJKOKKOS);
1407:     MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
1408:   } else {
1409:     MatSetType(*A,MATSEQAIJKOKKOS);
1410:     MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
1411:   }
1412:   return 0;
1413: }
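
/* Illustrative sketch (editor's addition): direct use of MatCreateAIJKokkos() with scalar
   preallocation, e.g. at most 3 nonzeros per row in the diagonal block and 1 in the off-diagonal
   block. The numbers and the function name ExampleCreatePreallocated are hypothetical. */
static PetscErrorCode ExampleCreatePreallocated(MPI_Comm comm,PetscInt mlocal,Mat *A)
{
  MatCreateAIJKokkos(comm,mlocal,mlocal,PETSC_DETERMINE,PETSC_DETERMINE,3,NULL,1,NULL,A);
  /* ... fill with MatSetValues() and call MatAssemblyBegin()/MatAssemblyEnd() as usual ... */
  return 0;
}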

1415: // Get a GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat.
1416: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1417: {
1418:   PetscMPIInt                size,rank;
1419:   MPI_Comm                   comm;
1420:   PetscSplitCSRDataStructure d_mat=NULL;

1422:   PetscObjectGetComm((PetscObject)A,&comm);
1423:   MPI_Comm_size(comm,&size);
1424:   MPI_Comm_rank(comm,&rank);
1425:   if (size == 1) {
1426:     MatSeqAIJKokkosGetDeviceMat(A,&d_mat);
1427:     MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1428:   } else {
1429:     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
1430:     MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);
1431:     MatSeqAIJKokkosModifyDevice(aij->A);
1432:     MatSeqAIJKokkosModifyDevice(aij->B);
1434:   }
1435:   // mimic the bookkeeping of MatSetValues, since MatSetValues itself is not called on the host
1436:   if (A->assembled) {
1437:     if (A->was_assembled) {
1438:       PetscInfo(A,"Assembled more than once already\n");
1439:     }
1440:     A->was_assembled = PETSC_TRUE; // done lazily in MatAssembly, which we no longer call; AIJ's AssemblyEnd also sets it, so it may be redundant here
1441:   } else {
1442:     PetscInfo(A,"Warning: matrix is not assembled (assembled=%d)\n",(int)A->assembled);
1443:   }
1444:   if (!d_mat) {
1445:     struct _n_SplitCSRMat h_mat; /* host container */
1446:     Mat_SeqAIJKokkos      *aijkokA;
1447:     Mat_SeqAIJ            *jaca;
1448:     PetscInt              n = A->rmap->n, nnz;
1449:     Mat                   Amat;
1450:     PetscInt              *colmap;

1452:     /* create and copy h_mat */
1453:     h_mat.M = A->cmap->N; // used for debug builds
1454:     PetscInfo(A,"Create device matrix in Kokkos\n");
1455:     if (size == 1) {
1456:       Amat = A;
1457:       jaca = (Mat_SeqAIJ*)A->data;
1458:       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1459:       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1460:       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1461:       h_mat.offdiag.a = NULL;
1462:       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1463:     } else {
1464:       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
1465:       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
1466:       PetscInt         ii;
1467:       Mat_SeqAIJKokkos *aijkokB;

1469:       Amat = aij->A;
1470:       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1471:       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1472:       jaca = (Mat_SeqAIJ*)aij->A->data;
1475:       aij->donotstash = PETSC_TRUE;
1476:       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1477:       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1478:       PetscCalloc1(A->cmap->N,&colmap);
1479:       PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));
1480:       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1481:       // allocate B copy data
1482:       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1483:       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1484:       nnz = jacb->i[n];
1485:       if (jacb->compressedrow.use) {
1486:         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
1487:         aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1488:         Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k);
1489:         h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1490:       } else {
1491:          h_mat.offdiag.i = aijkokB->i_device_data();
1492:       }
1493:       h_mat.offdiag.j = aijkokB->j_device_data();
1494:       h_mat.offdiag.a = aijkokB->a_device_data();
1495:       {
1496:         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
1497:         aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1498:         Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k);
1499:         h_mat.colmap = aijkokB->colmap_d.data();
1500:         PetscFree(colmap);
1501:       }
1502:       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1503:       h_mat.offdiag.n = n;
1504:     }
1505:     // allocate A copy data
1506:     nnz = jaca->i[n];
1507:     h_mat.diag.n = n;
1508:     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1509:     MPI_Comm_rank(comm,&h_mat.rank);
1510:     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not support compressed row (todo)"); /* reconstructed guard for the missing line; the diagonal block has no uncompressed-i path here */
1511:     else {
1512:       h_mat.diag.i = aijkokA->i_device_data();
1513:     }
1514:     h_mat.diag.j = aijkokA->j_device_data();
1515:     h_mat.diag.a = aijkokA->a_device_data();
1516:     // copy pointers and metadata to the device
1517:     MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);
1518:     MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);
1519:     PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);
1520:   }
1521:   *B = d_mat; // return the device matrix (it is also cached in the Mat)
1522:   A->assembled = PETSC_FALSE; // ready for writes with MatSetValues; normally this is done lazily inside MatSetValues
1523:   return 0;
1524: }
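
/* Illustrative sketch (editor's addition): a caller would typically fetch the device handle once
   after preallocation/assembly and hand it to a Kokkos kernel that inserts values on the device;
   the kernel itself is omitted here. The function name ExampleGetDeviceHandle is hypothetical. */
static PetscErrorCode ExampleGetDeviceHandle(Mat A)
{
  PetscSplitCSRDataStructure d_mat;

  MatKokkosGetDeviceMatWrite(A,&d_mat); /* d_mat points to device-resident split-CSR metadata */
  /* ... launch a Kokkos kernel that assembles entries through d_mat ... */
  return 0;
}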

1526: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1527: {
1528:   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

1530:   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1531:   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1532:   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1533:   else *mask = "PETSC_OFFLOAD_BOTH";
1534:   return 0;
1535: }

1537: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1538: {
1539:   PetscMPIInt  size;
1540:   Mat          Ad,Ao;
1541:   const char  *amask,*bmask;

1543:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);

1545:   if (size == 1) {
1546:     MatSeqAIJKokkosGetOffloadMask(A,&amask);
1547:     PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);
1548:   } else {
1549:     Ad  = ((Mat_MPIAIJ*)A->data)->A;
1550:     Ao  = ((Mat_MPIAIJ*)A->data)->B;
1551:     MatSeqAIJKokkosGetOffloadMask(Ad,&amask);
1552:     MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);
1553:     PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);
1554:   }
1555:   return 0;
1556: }
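
/* Illustrative sketch (editor's addition): the two helpers above are debugging aids for
   host/device synchronization, e.g. printing the offload mask before and after an operation
   expected to run on the device. The function name ExampleCheckOffload is hypothetical. */
static PetscErrorCode ExampleCheckOffload(Mat A,Vec x,Vec y)
{
  MatAIJKokkosPrintOffloadMask(A); /* e.g. "PETSC_OFFLOAD_BOTH" right after assembly */
  MatMult(A,x,y);
  MatAIJKokkosPrintOffloadMask(A); /* the mask shows where the values are currently valid */
  return 0;
}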