Actual source code: mpiaijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscsf.h>
3: #include <petsc/private/sfimpl.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
6: #include <KokkosSparse_spadd.hpp>
8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
9: {
10: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
11: Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
13: MatAssemblyEnd_MPIAIJ(A,mode);
14: if (aijkok && aijkok->device_mat_d.data()) {
15: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
16: }
18: return 0;
19: }
21: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
22: {
23: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
25: PetscLayoutSetUp(mat->rmap);
26: PetscLayoutSetUp(mat->cmap);
27: #if defined(PETSC_USE_DEBUG)
28: if (d_nnz) {
29: PetscInt i;
30: for (i=0; i<mat->rmap->n; i++) {
32: }
33: }
34: if (o_nnz) {
35: PetscInt i;
36: for (i=0; i<mat->rmap->n; i++) {
38: }
39: }
40: #endif
41: #if defined(PETSC_USE_CTABLE)
42: PetscTableDestroy(&mpiaij->colmap);
43: #else
44: PetscFree(mpiaij->colmap);
45: #endif
46: PetscFree(mpiaij->garray);
47: VecDestroy(&mpiaij->lvec);
48: VecScatterDestroy(&mpiaij->Mvctx);
49: /* Because B will have been resized, we simply destroy it and create a new one each time */
50: MatDestroy(&mpiaij->B);
52: if (!mpiaij->A) {
53: MatCreate(PETSC_COMM_SELF,&mpiaij->A);
54: MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);
55: PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);
56: }
57: if (!mpiaij->B) {
58: PetscMPIInt size;
59: MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);
60: MatCreate(PETSC_COMM_SELF,&mpiaij->B);
61: MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);
62: PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);
63: }
64: MatSetType(mpiaij->A,MATSEQAIJKOKKOS);
65: MatSetType(mpiaij->B,MATSEQAIJKOKKOS);
66: MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);
67: MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);
68: mat->preallocated = PETSC_TRUE;
69: return 0;
70: }
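/* Illustrative usage sketch (added by the editor, not part of the original source): how a caller
   typically reaches the preallocation and assembly routines above. The sizes and nonzero estimates
   below are made up for the example.

     Mat         A;
     PetscInt    row = 0, col = 0;
     PetscScalar val = 1.0;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetType(A,MATMPIAIJKOKKOS);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);    // 5 nz/row in the diag block, 2 in the offdiag block
     MatSetValues(A,1,&row,1,&col,&val,ADD_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);          // ends up in MatAssemblyEnd_MPIAIJKokkos above
     MatDestroy(&A);
*/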
72: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
73: {
74: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
75: PetscInt nt;
77: VecGetLocalSize(xx,&nt);
79: VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
80: (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);
81: VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
82: (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);
83: return 0;
84: }
86: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
87: {
88: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
89: PetscInt nt;
91: VecGetLocalSize(xx,&nt);
93: VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
94: (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);
95: VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);
96: (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);
97: return 0;
98: }
100: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
101: {
102: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
103: PetscInt nt;
105: VecGetLocalSize(xx,&nt);
107: (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);
108: (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);
109: VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
110: VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
111: return 0;
112: }
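/* Illustrative usage sketch (added by the editor, not part of the original source): the three routines
   above back the standard MatMult()/MatMultAdd()/MatMultTranspose() interface once the matrix type is
   MPIAIJKOKKOS; vectors created from the matrix are VECKOKKOS so the work stays on device.

     Vec x,y;
     MatCreateVecs(A,&x,&y);    // x is a right vector, y a left vector, both of A's default vec type
     MatMult(A,x,y);            // y = A*x
     MatMultTranspose(A,y,x);   // x = A^T*y
     VecDestroy(&x);
     VecDestroy(&y);
*/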
114: /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
115: A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
116: C still uses local column ids. Their corresponding global column ids are returned in glob.
117: */
118: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
119: {
120: Mat Ad,Ao;
121: const PetscInt *cmap;
123: MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);
124: MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);
125: if (glob) {
126: PetscInt cst, i, dn, on, *gidx;
127: MatGetLocalSize(Ad,NULL,&dn);
128: MatGetLocalSize(Ao,NULL,&on);
129: MatGetOwnershipRangeColumn(mat,&cst,NULL);
130: PetscMalloc1(dn+on,&gidx);
131: for (i=0; i<dn; i++) gidx[i] = cst + i;
132: for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
133: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
134: }
135: return 0;
136: }
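/* Illustrative usage sketch (added by the editor, not part of the original source): callers obtain the
   merged local matrix [Ad Ao] and the global ids of its columns through the public interface, e.g.,

     Mat C_local;
     IS  glob;
     MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&C_local);  // dispatches here for MPIAIJKOKKOS
     // the i-th entry of glob is the global column id of local column i of C_local
     ISDestroy(&glob);
     MatDestroy(&C_local);
*/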
138: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
139: struct MatMatStruct {
140: MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */
141: PetscSF sf; /* SF to send/recv matrix entries */
142: MatScalarKokkosView abuf; /* buf of mat values in send/recv */
143: Mat C1,C2,B_local;
144: KokkosCsrMatrix C1_global,C2_global,C_global;
145: KernelHandle kh;
146: MatMatStruct() {
147: C1 = C2 = B_local = NULL;
148: sf = NULL;
149: }
151: ~MatMatStruct() {
152: MatDestroy(&C1);
153: MatDestroy(&C2);
154: MatDestroy(&B_local);
155: PetscSFDestroy(&sf);
156: kh.destroy_spadd_handle();
157: }
158: };
160: struct MatMatStruct_AB : public MatMatStruct {
161: MatColIdxKokkosView rows;
162: MatRowMapKokkosView rowoffset;
163: Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
165: MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
166: ~MatMatStruct_AB() {
167: MatDestroy(&B_other);
168: MatDestroy(&C_petsc);
169: }
170: };
172: struct MatMatStruct_AtB : public MatMatStruct {
173: MatRowMapKokkosView srcrowoffset,dstrowoffset;
174: };
176: struct MatProductData_MPIAIJKokkos
177: {
178: MatMatStruct_AB *mmAB;
179: MatMatStruct_AtB *mmAtB;
180: PetscBool reusesym;
182: MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
183: ~MatProductData_MPIAIJKokkos() {
184: delete mmAB;
185: delete mmAtB;
186: }
187: };
189: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
190: {
191: delete static_cast<MatProductData_MPIAIJKokkos*>(data);
192: return 0;
193: }
195: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
197: Input Parameters:
198: + A - the MATSEQAIJKOKKOS matrix
199: . N - new column size for the returned Kokkos matrix
200: - l2g - a map that maps old col ids to new col ids
202: Output Parameters:
203: . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
204: */
205: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
206: {
207: KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
208: MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */
210: Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));});
211: csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg);
212: return 0;
213: }
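/* Worked example (added by the editor; the numbers are hypothetical): if A has 3 local columns with
   l2g = {7, 2, 9} and A's j array is {0, 2, 2, 1}, the returned csrmat reuses A's a and i arrays but
   gets the new j array jg = {7, 9, 9, 2}, and its column size becomes N.
*/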
215: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
216: It is similar to MatCreateMPIAIJWithSplitArrays.
218: Input Parameters:
219: + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
220: . A - the diag matrix using local col ids
221: - B - the offdiag matrix using global col ids
223: Output Parameters:
224: . mat - the updated MATMPIAIJKOKKOS matrix
225: */
226: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
227: {
228: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
229: PetscInt m,n,M,N,Am,An,Bm,Bn;
230: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
232: MatGetSize(mat,&M,&N);
233: MatGetLocalSize(mat,&m,&n);
234: MatGetLocalSize(A,&Am,&An);
235: MatGetLocalSize(B,&Bm,&Bn);
241: mpiaij->A = A;
242: mpiaij->B = B;
244: mat->preallocated = PETSC_TRUE;
245: mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
247: MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);
248: MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);
249: /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
250: also gets mpiaij->B compacted, with its col ids and size reduced
251: */
252: MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);
253: MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);
254: MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);
256: /* Update bkok with new local col ids (stored on host) and size */
257: bkok->j_dual.modify_host();
258: bkok->j_dual.sync_device();
259: bkok->SetColSize(mpiaij->B->cmap->n);
260: return 0;
261: }
263: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).
265: It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
266: In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
267: If C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k
268: to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
270: Collective on comm of ownerSF
272: Input Parameters:
273: + B - the SEQAIJKOKKOS matrix, using local col ids
274: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
275: . N - global col ids are in range of [0,N). N must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
276: . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
277: . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
279: Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
280: + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
281: . abuf - buffer for sending matrix values
282: . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
283: Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
284: . rowoffset - For each row in rows[], the offset in abuf[] at which that row's values are copied
285: - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
286: */
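/* Worked example of the ownerSF semantics above (added by the editor; ranks and rows are hypothetical):
   with 2 ranks, suppose rank 1 needs rows 3 and 5 of the B stored on rank 0. Then rank 1's ownerSF has
   two leaves, connected to roots (rank=0,index=3) and (rank=0,index=5), and after the bcast the C on
   rank 1 has 2 rows, which are copies of those two rows of rank 0's B with col ids mapped through l2g.
*/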
287: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
288: PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
289: MatRowMapKokkosView& rowoffset,Mat& C)
290: {
291: Mat_SeqAIJKokkos *bkok,*ckok;
293: MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
294: bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
296: if (reuse == MAT_REUSE_MATRIX) {
297: ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
299: const auto& Ba = bkok->a_dual.view_device();
300: const auto& Bi = bkok->i_dual.view_device();
301: const auto& Ca = ckok->a_dual.view_device();
303: /* Copy Ba to abuf */
304: Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
305: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
306: PetscInt r = rows(i);
307: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
308: Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
309: abuf(base+k) = Ba(Bi(r)+k);
310: });
311: });
313: /* Send abuf to Ca through bcastSF and then mark C is updated on device */
314: PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE); /* TODO: get memtype for abuf */
315: PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);
316: ckok->a_dual.modify_device();
317: } else if (reuse == MAT_INITIAL_MATRIX) {
318: MPI_Comm comm;
319: PetscMPIInt tag;
320: PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;
322: PetscObjectGetComm((PetscObject)ownerSF,&comm);
323: PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);
324: Cm = nleaves; /* row size of C */
325: Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */
327: /* Get row lens (nz) of B's rows for later fast query */
328: PetscInt *Browlens;
329: const PetscInt *tmp = bkok->i_host_data();
330: PetscMalloc1(nroots,&Browlens);
331: for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];
333: /* By ownerSF, each proc gets lens of rows of C */
334: MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
335: Ci_h = Ci.view_host().data();
336: Ci_h[0] = 0;
337: PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);
338: PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);
339: for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
340: Cnnz = Ci_h[Cm];
341: Ci.modify_host();
342: Ci.sync_device();
344: /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
345: MatColIdxKokkosDualView Cj("j",Cnnz);
346: MatScalarKokkosDualView Ca("a",Cnnz);
348: /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
349: const PetscMPIInt *iranks,*ranks;
350: const PetscInt *ioffset,*irootloc,*roffset;
351: PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
352: MPI_Request *reqs;
354: PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc); /* irootloc[] contains indices of rows I need to send to each receiver */
355: PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/); /* recv info */
357: /* figure out offsets at the send buffer, to build the SF
358: sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
359: rowptr[] - stores offsets for data of each row in abuf
361: rdisp[] - to receive sdisp[]
362: */
363: PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);
364: MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
365: rowptr = rowptr_h.data();
367: sdisp[0] = 0;
368: rowptr[0] = 0;
369: for (i=0; i<niranks; i++) { /* for each receiver */
370: PetscInt len, nz = 0;
371: for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
372: len = Browlens[irootloc[j]];
373: rowptr[j+1] = rowptr[j] + len;
374: nz += len;
375: }
376: sdisp[i+1] = sdisp[i] + nz;
377: }
378: PetscCommGetNewTag(comm,&tag);
379: for (i=0; i<nranks; i++) MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);
380: for (i=0; i<niranks; i++) MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);
381: MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);
383: PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
384: PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
385: PetscSFNode *iremote;
386: PetscMalloc1(nleaves2,&iremote);
387: for (i=0; i<nranks; i++) { /* for each sender */
388: k = 0;
389: for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
390: iremote[j].rank = ranks[i];
391: iremote[j].index = rdisp[i] + k;
392: k++;
393: }
394: }
395: /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
396: PetscSFCreate(comm,&bcastSF);
397: PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
399: /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
400: from local to global. Then use bcastSF to fill Ca, Cj.
401: */
402: ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
403: MatColIdxKokkosView rows("rows",ioffset[niranks]);
404: Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */
406: rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
408: MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
409: abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */
411: const auto& Ba = bkok->a_dual.view_device();
412: const auto& Bi = bkok->i_dual.view_device();
413: const auto& Bj = bkok->j_dual.view_device();
415: /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
416: Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
417: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
418: PetscInt r = rows(i);
419: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
420: Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
421: abuf(base+k) = Ba(Bi(r)+k);
422: jbuf(base+k) = l2g(Bj(Bi(r)+k));
423: });
424: });
426: /* Send abuf & jbuf to fill Ca, Cj */
427: PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);
428: PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);
429: PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);
430: PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);
431: Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
432: Cj.sync_host();
433: Ca.modify_device();
435: /* Construct C with Ca, Ci, Cj */
436: auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
437: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);
438: PetscFree3(sdisp,rdisp,reqs);
439: PetscFree(Browlens);
440: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
441: return 0;
442: }
444: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
446: It is the reverse of MatSeqAIJKokkosBcast in some sense.
448: Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves.
449: In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
450: contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
452: Input Parameters:
453: + A - the SEQAIJKOKKOS matrix to be reduced
454: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
455: . local - true if A uses local col ids; false if A is already in global col ids.
456: . N - if local, N is A's global col size
457: . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
458: - ownerSF - the SF specifies ownership (root) of rows in A
460: Output Parameters:
461: + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side
462: . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
463: . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
464: . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Rows i and i+1 in abuf[] may go to
465: unrelated places in Ca, so dstrowoffset is not in a CSR-like format the way srcrowoffset is.
466: - C - the matrix made up by rows sent to me from other ranks, using global col ids
468: TODO: we could even have MatSeqAIJKokkosReduceBegin/End to provide an opportunity for callers to overlap computation and communication when reuse = MAT_REUSE_MATRIX.
469: */
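/* Worked example of the reduction above (added by the editor; ranks and rows are hypothetical): suppose
   rows 0 and 2 of my A both have the root (rank=1,index=4) in ownerSF. Then row 4 of C on rank 1 is the
   concatenation of my rows 0 and 2 (possibly with repeated col ids, to be summed up later), and
   abuf/srcrowoffset/dstrowoffset record where each received row lands in C's value array.
*/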
470: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
471: PetscSF& reduceSF,MatScalarKokkosView& abuf,
472: MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
473: KokkosCsrMatrix& C)
474: {
475: PetscInt i,r,Am,An,Annz,Cnnz,nrows;
476: const PetscInt *Ai;
477: Mat_SeqAIJKokkos *akok;
479: MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
480: MatGetSize(A,&Am,&An);
481: Ai = static_cast<Mat_SeqAIJ*>(A->data)->i;
482: akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
483: Annz = Ai[Am];
485: if (reuse == MAT_REUSE_MATRIX) {
486: /* Send Aa to abuf */
487: PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
488: PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
490: /* Copy abuf to Ca */
491: const MatScalarKokkosView& Ca = C.values;
492: nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
493: Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
494: PetscInt i = t.league_rank();
495: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
496: PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
497: Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
498: });
499: } else if (reuse == MAT_INITIAL_MATRIX) {
500: MPI_Comm comm;
501: MPI_Request *reqs;
502: PetscMPIInt tag;
503: PetscInt Cm;
505: PetscObjectGetComm((PetscObject)ownerSF,&comm);
506: PetscCommGetNewTag(comm,&tag);
508: PetscInt niranks,nranks,nroots,nleaves;
509: const PetscMPIInt *iranks,*ranks;
510: const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
511: PetscSFSetUp(ownerSF);
512: PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows); /* recv info: iranks[] will send rows to me */
513: PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/); /* send info */
514: PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);
516: Cm = nroots;
517: nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
519: /* Tell owners how long each row I will send */
520: PetscInt *srowlens; /* send buf of row lens */
521: MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
522: PetscInt *rrowlens = rrowlens_h.data();
524: PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);
525: for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
526: rrowlens[0] = 0;
527: rrowlens++; /* shift the pointer to make the following expression more readable */
528: for (i=0; i<niranks; i++)MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);
529: for (i=0; i<nranks; i++) MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);
530: MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);
532: /* Owner builds Ci on host by histogramming rrowlens[] */
533: MatRowMapKokkosViewHost Ci_h("i",Cm+1);
534: Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
535: MatRowMapType *Ci_ptr = Ci_h.data();
537: for (i=0; i<nrows; i++) {
538: r = rows[i]; /* local row id of i-th received row */
539: #if defined(PETSC_USE_DEBUG)
541: #endif
542: Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
543: }
544: for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
545: Cnnz = Ci_ptr[Cm];
547: /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
548: MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
549: PetscInt *dstrowoffset_hptr = dstrowoffset_h.data();
550: PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
552: PetscCalloc1(Cm,&currowlens); /* Init with zero, to be added to */
553: for (i=0; i<nrows; i++) { /* for each row I receive */
554: r = rows[i]; /* row id in C */
555: dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
556: currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */
557: }
558: PetscFree(currowlens);
560: rrowlens--;
561: for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
562: dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
563: srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
565: /* Build the reduceSF, which performs buffer to buffer send/recv */
566: PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
567: PetscMalloc2(niranks,&sdisp,nranks,&rdisp);
568: for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
569: for (i=0; i<nranks; i++) MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);
570: for (i=0; i<niranks; i++) MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);
571: MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);
573: /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
574: PetscInt nroots2 = Cnnz,nleaves2 = Annz;
575: PetscSFNode *iremote;
576: PetscMalloc1(nleaves2,&iremote); /* no free, since memory will be given to reduceSF */
577: for (i=0; i<nranks; i++) {
578: PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
579: PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
580: PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
581: for (PetscInt k=0; k<nz; k++) {
582: iremote[leafbase+k].rank = ranks[i];
583: iremote[leafbase+k].index = rootbase + k;
584: }
585: }
586: PetscSFCreate(comm,&reduceSF);
587: PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
588: PetscFree2(sdisp,rdisp);
590: /* Reduce Aa, Ajg to abuf and jbuf */
592: /* If A uses local col ids, convert them to global ones before sending */
593: MatColIdxKokkosView Ajg;
594: if (local) {
595: Ajg = MatColIdxKokkosView("j",Annz);
596: const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
597: Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
598: } else {
599: Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
600: }
602: MatColIdxKokkosView jbuf("jbuf",Cnnz);
603: abuf = MatScalarKokkosView("abuf",Cnnz);
604: PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);
605: PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);
606: PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
607: PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);
609: /* Copy data from abuf, jbuf to Ca, Cj */
610: MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
611: MatColIdxKokkosView Cj("j",Cnnz);
612: MatScalarKokkosView Ca("a",Cnnz);
614: Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
615: PetscInt i = t.league_rank();
616: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
617: PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
618: Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
619: Ca(dst+k) = abuf(src+k);
620: Cj(dst+k) = jbuf(src+k);
621: });
622: });
624: /* Build C with Ca, Ci, Cj */
625: C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
626: PetscFree2(srowlens,reqs);
627: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
628: return 0;
629: }
631: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix
633: Input Parameters:
634: + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
635: . reuse - indicate whether the matrix has called this function before
636: . csrmat - the KokkosCsrMatrix, of size m,N
637: - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the position of the first
638: entry of the diag block of C in csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
639: entry is 5, then Cdstart[i] = 3.
641: Output Parameters:
642: + C - the updated MATMPIAIJKOKKOS matrix
643: - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
645: Notes:
646: Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
647: */
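/* Worked example of the split (added by the editor; the numbers are hypothetical): with column ownership
   range [cstart,cend) = [4,8) and a row of csrmat with col ids {0, 3, 4, 5, 7, 9}, the row is cut into
   {0, 3} (left of the diag block, goes to Co), {4, 5, 7} (the diag block, goes to Cd with local ids
   {0, 1, 3}) and {9} (right of the diag block, goes to Co); hence Cdstart = 2 and Cd's row length is 3.
*/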
648: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
649: {
650: const MatScalarKokkosView& Ca = csrmat.values;
651: const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
652: PetscInt m,n,N;
654: MatGetLocalSize(C,&m,&n);
655: MatGetSize(C,NULL,&N);
657: if (reuse == MAT_REUSE_MATRIX) {
658: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
659: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
660: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
661: const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
662: const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();
664: /* Fill 'a' of Cd and Co on device */
665: Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
666: PetscInt i = t.league_rank(); /* row i */
667: PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */
668: PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
669: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
670: PetscInt cdend = cdstart + cdlen;
671: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
672: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
673: if (k < cdstart) { /* k in [0, cdstart) */
674: Coa(Coi(i)+k) = Ca(Ci(i)+k);
675: } else if (k < cdend) { /* k in [cdstart, cdend) */
676: Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
677: } else { /* k in [cdend, clen) */
678: Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
679: }
680: });
681: });
683: akok->a_dual.modify_device();
684: bkok->a_dual.modify_device();
685: } else if (reuse == MAT_INITIAL_MATRIX) {
686: Mat Cd,Co;
687: const MatColIdxKokkosView& Cj = csrmat.graph.entries;
688: MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1);
689: MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
690: PetscInt cstart,cend;
692: /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
693: left of the diag block, the diag block, and right of the diag block. The diag block has col ids in [cstart,cend).
694: Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
695: such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
696: stores the values of cdstart and cdend-cdstart (aka Cdi[]) instead.
697: */
698: Cdstart = MatRowMapKokkosView("Cdstart",m);
699: PetscLayoutGetRange(C->cmap,&cstart,&cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
701: /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
702: CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
703: */
704: Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
705: Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
706: PetscInt i = t.league_rank(); /* row i */
707: PetscInt j,first,count,step;
709: if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
710: Cdi(0) = 0;
711: Coi(0) = 0;
712: }
714: /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
715: in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
716: */
717: count = Ci(i+1)-Ci(i);
718: first = Ci(i);
719: while (count > 0) {
720: j = first;
721: step = count / 2;
722: j += step;
723: if (Cj(j) < cstart) {
724: first = ++j;
725: count -= step + 1;
726: } else count = step;
727: }
728: Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
730: /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
731: count = Ci(i+1) - first;
732: while (count > 0) {
733: j = first;
734: step = count / 2;
735: j += step;
736: if (Cj(j) < cend) {
737: first = ++j;
738: count -= step + 1;
739: } else count = step;
740: }
741: Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
742: Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
743: });
744: });
746: /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
747: Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
748: update += Cdi(i);
749: if (final) Cdi(i) = update;
750: });
751: Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
752: update += Coi(i);
753: if (final) Coi(i) = update;
754: });
756: /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
757: MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
758: */
759: Cdi_dual.modify_device();
760: Coi_dual.modify_device();
761: Cdi_dual.sync_host();
762: Coi_dual.sync_host();
763: PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
764: PetscInt Co_nnz = Coi_dual.view_host().data()[m];
766: /* With nnz, allocate a, j for Cd and Co */
767: MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
768: MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);
770: /* Fill a, j of Cd and Co on device */
771: MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
772: MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();
774: Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
775: PetscInt i = t.league_rank(); /* row i */
776: PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */
777: PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
778: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
779: PetscInt cdend = cdstart + cdlen;
780: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
781: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
782: if (k < cdstart) { /* k in [0, cdstart) */
783: Coa(Coi(i)+k) = Ca(Ci(i)+k);
784: Coj(Coi(i)+k) = Cj(Ci(i)+k);
785: } else if (k < cdend) { /* k in [cdstart, cdend) */
786: Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
787: Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
788: } else { /* k in [cdend, clen) */
789: Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
790: Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
791: }
792: });
793: });
795: Cdj_dual.modify_device();
796: Cda_dual.modify_device();
797: Coj_dual.modify_device();
798: Coa_dual.modify_device();
799: /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
800: auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
801: auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
802: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);
803: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);
804: MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co); /* Coj will be converted to local ids within */
805: }
806: return 0;
807: }
809: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.
811: It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
813: Input Parameter:
814: . C - the SEQAIJKOKKOS matrix, whose j array currently holds global col ids
820: Output Parameters:
821: + C - the updated matrix, with its col ids compacted to local ones and its col size reduced accordingly
822: - l2g - a Kokkos view mapping C's new local col ids back to the original global ones
824: Notes: the input matrix's col ids and col size will be changed.
825: */
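/* Worked example (added by the editor; the numbers are hypothetical): if C's j array only references the
   global columns {3, 100, 1005}, compaction renumbers them to the local ids {0, 1, 2}, shrinks C's col
   size to 3, and returns l2g = {3, 100, 1005} so that the global ids can be recovered.
*/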
826: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
827: {
828: Mat_SeqAIJKokkos *ckok;
829: ISLocalToGlobalMapping l2gmap;
830: const PetscInt *garray;
831: PetscInt sz;
833: /* Compact C's global col ids and col size. We do this since we guess KokkosKernels (KK) might be more memory scalable with local ids */
834: MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);
835: ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
836: ckok->j_dual.modify_host(); /* C's j was modified on host; we need to sync it to device */
837: ckok->j_dual.sync_device();
838: ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
840: /* Build l2g -- the local to global mapping of C's cols */
841: ISLocalToGlobalMappingGetIndices(l2gmap,&garray);
842: ISLocalToGlobalMappingGetSize(l2gmap,&sz);
845: ConstMatColIdxKokkosViewHost tmp(garray,sz);
846: l2g = MatColIdxKokkosView("l2g",sz);
847: Kokkos::deep_copy(l2g,tmp);
849: ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);
850: ISLocalToGlobalMappingDestroy(&l2gmap);
851: return 0;
852: }
854: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
856: Input Parameters:
857: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
858: . A - an MPIAIJKOKKOS matrix
859: . B - an MPIAIJKOKKOS matrix
860: - mm - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.
862: Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
863: */
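/* Editor's note restating the decomposition implemented below (no new algorithm): writing the local rows
   of A as [Ad Ao], where Ad's columns correspond to locally owned rows of B (merged into B_local) and
   Ao's columns correspond to remotely owned rows of B (gathered into B_other by MatSeqAIJKokkosBcast),
   the locally owned rows of C = A*B are

     C = Ad * B_local + Ao * B_other     (computed as C1 + C2 with KokkosSparse::spadd)
*/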
864: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
865: {
866: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data);
867: Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */
868: IS glob = NULL;
869: const PetscInt *garray;
870: PetscInt N = B->cmap->N,sz;
871: ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */
872: MatColIdxKokkosView l2g2;
873: Mat C1,C2; /* intermediate matrices */
875: /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
876: MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);
877: MatProductCreate(Ad,mm->B_local,NULL,&C1);
878: MatProductSetType(C1,MATPRODUCT_AB);
879: MatProductSetFill(C1,product->fill);
880: C1->product->api_user = product->api_user;
881: MatProductSetFromOptions(C1);
883: (*C1->ops->productsymbolic)(C1);
885: ISGetIndices(glob,&garray);
886: ISGetSize(glob,&sz);
887: const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
888: l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
889: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global);
891: /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
892: MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,mm->abuf,mm->rows,mm->rowoffset,mm->B_other);
894: /* Compact B_other to use local ids, as we guess KK spgemm is more memory scalable with that; we could skip the compaction to simplify the code */
895: MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);
896: MatProductCreate(Ao,mm->B_other,NULL,&C2);
897: MatProductSetType(C2,MATPRODUCT_AB);
898: MatProductSetFill(C2,product->fill);
899: C2->product->api_user = product->api_user;
900: MatProductSetFromOptions(C2);
902: (*C2->ops->productsymbolic)(C2);
903: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global);
905: /* C = C1 + C2. We actually use their global col ids versions in adding */
906: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
907: KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
908: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
909: KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
911: mm->C1 = C1;
912: mm->C2 = C2;
913: ISRestoreIndices(glob,&garray);
914: ISDestroy(&glob);
915: return 0;
916: }
918: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
920: Input Parameters:
921: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
922: . A - an MPIAIJKOKKOS matrix
923: . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
924: . localB - Does B use local col ids? If false, then B is already in global col ids.
925: . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
926: . l2g - If localB, then l2g maps B's local col ids to global ones.
927: - mm - a struct used to stash intermediate data in AtB
929: Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
930: */
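/* Editor's note restating the decomposition implemented below (no new algorithm): writing the local rows
   of A as [Ad Ao], the product C = A^t * B splits into

     C1 = Ad^t * B     (contributions to rows of C owned by this rank)
     C2 = Ao^t * B     (contributions to rows of C owned by other ranks)

   C2's rows are sent to their owning ranks by MatSeqAIJKokkosReduce and then added to C1 with
   KokkosSparse::spadd to form C_global.
*/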
931: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
932: {
933: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data);
934: Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */
935: Mat C1,C2; /* intermediate matrices */
937: /* C1 = Ad^t * B */
938: MatProductCreate(Ad,B,NULL,&C1);
939: MatProductSetType(C1,MATPRODUCT_AtB);
940: MatProductSetFill(C1,product->fill);
941: C1->product->api_user = product->api_user;
942: MatProductSetFromOptions(C1);
944: (*C1->ops->productsymbolic)(C1);
946: if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);
947: else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
949: /* C2 = Ao^t * B */
950: MatProductCreate(Ao,B,NULL,&C2);
951: MatProductSetType(C2,MATPRODUCT_AtB);
952: MatProductSetFill(C2,product->fill);
953: C2->product->api_user = product->api_user;
954: MatProductSetFromOptions(C2);
956: (*C2->ops->productsymbolic)(C2);
958: MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);
960: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
961: KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
962: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
963: KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
964: mm->C1 = C1;
965: mm->C2 = C2;
966: return 0;
967: }
969: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
970: {
971: Mat_Product *product = C->product;
972: MatProductType ptype;
973: MatProductData_MPIAIJKokkos *mmdata;
974: MatMatStruct *mm = NULL;
975: MatMatStruct_AB *ab;
976: MatMatStruct_AtB *atb;
977: Mat A,B,Ad,Ao,Bd,Bo;
978: const MatScalarType one = 1.0; /* Do not use the literal 1.0 directly, to avoid a wrong template instantiation in KokkosSparse::spadd_numeric */
980: MatCheckProduct(C,1);
981: mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
982: ptype = product->type;
983: A = product->A;
984: B = product->B;
985: Ad = static_cast<Mat_MPIAIJ*>(A->data)->A;
986: Ao = static_cast<Mat_MPIAIJ*>(A->data)->B;
987: Bd = static_cast<Mat_MPIAIJ*>(B->data)->A;
988: Bo = static_cast<Mat_MPIAIJ*>(B->data)->B;
990: if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
991: mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
992: ab = mmdata->mmAB;
993: atb = mmdata->mmAtB;
994: if (ab) {
995: static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
996: static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
997: }
998: if (atb) {
999: static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1000: static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1001: }
1002: return 0;
1003: }
1005: if (ptype == MATPRODUCT_AB) {
1006: ab = mmdata->mmAB;
1007: /* C1 = Ad * B_local */
1009: MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);
1011: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,ab->C1);
1012: (*ab->C1->ops->productnumeric)(ab->C1);
1013: PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1014: ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1015: /* C2 = Ao * B_other */
1017: if (ab->C1->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,ab->C2);
1018: (*ab->C2->ops->productnumeric)(ab->C2);
1019: /* C = C1_global + C2_global */
1020: KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1021: mm = static_cast<MatMatStruct*>(ab);
1022: } else if (ptype == MATPRODUCT_AtB) {
1023: atb = mmdata->mmAtB;
1025: /* C1 = Ad^t * B_local */
1026: MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);
1028: if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,atb->C1);
1029: (*atb->C1->ops->productnumeric)(atb->C1);
1031: /* C2 = Ao^t * B_local */
1033: if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,atb->C2);
1034: (*atb->C2->ops->productnumeric)(atb->C2);
1035: /* Form C2_global */
1036: PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1037: atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1038: /* C = C1_global + C2_global */
1039: KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1040: mm = static_cast<MatMatStruct*>(atb);
1041: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1042: ab = mmdata->mmAB;
1043: MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);
1045: /* ab->C1 = Ad * B_local */
1047: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad,NULL,NULL,ab->C1);
1048: (*ab->C1->ops->productnumeric)(ab->C1);
1049: PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1050: ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1051: /* ab->C2 = Ao * B_other */
1052: if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao,NULL,NULL,ab->C2);
1053: (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1054: KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1056: /* atb->C1 = Bd^t * ab->C_petsc */
1057: atb = mmdata->mmAtB;
1059: if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd,NULL,NULL,atb->C1);
1060: (*atb->C1->ops->productnumeric)(atb->C1);
1061: /* atb->C2 = Bo^t * ab->C_petsc */
1062: if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo,NULL,NULL,atb->C2);
1063: (*atb->C2->ops->productnumeric)(atb->C2);
1064: PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1065: atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1066: KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1067: mm = static_cast<MatMatStruct*>(atb);
1068: }
1069: /* Split C_global to form C */
1070: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);
1071: return 0;
1072: }
1074: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1075: {
1076: Mat A,B;
1077: Mat_Product *product = C->product;
1078: MatProductType ptype;
1079: MatProductData_MPIAIJKokkos *mmdata;
1080: MatMatStruct *mm = NULL;
1081: IS glob = NULL;
1082: const PetscInt *garray;
1083: PetscInt m,n,M,N,sz;
1084: ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */
1086: MatCheckProduct(C,1);
1088: ptype = product->type;
1089: A = product->A;
1090: B = product->B;
1092: switch (ptype) {
1093: case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1094: case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1095: case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
1096: default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1097: }
1099: MatSetSizes(C,m,n,M,N);
1100: PetscLayoutSetUp(C->rmap);
1101: PetscLayoutSetUp(C->cmap);
1102: MatSetType(C,((PetscObject)A)->type_name);
1104: mmdata = new MatProductData_MPIAIJKokkos();
1105: mmdata->reusesym = product->api_user;
1107: if (ptype == MATPRODUCT_AB) {
1108: mmdata->mmAB = new MatMatStruct_AB();
1109: MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);
1110: mm = static_cast<MatMatStruct*>(mmdata->mmAB);
1111: } else if (ptype == MATPRODUCT_AtB) {
1112: mmdata->mmAtB = new MatMatStruct_AtB();
1113: auto atb = mmdata->mmAtB;
1114: MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);
1115: ISGetIndices(glob,&garray);
1116: ISGetSize(glob,&sz);
1117: l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
1118: MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);
1119: ISRestoreIndices(glob,&garray);
1120: ISDestroy(&glob);
1121: mm = static_cast<MatMatStruct*>(atb);
1122: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1123: mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */
1124: mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1125: auto ab = mmdata->mmAB;
1126: auto atb = mmdata->mmAtB;
1127: MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);
1128: auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1129: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);
1130: MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);
1131: mm = static_cast<MatMatStruct*>(atb);
1132: }
1133: /* Split the C_global into petsc A, B format */
1134: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);
1135: C->product->data = mmdata;
1136: C->product->destroy = MatProductDataDestroy_MPIAIJKokkos;
1137: C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos;
1138: return 0;
1139: }
1141: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1142: {
1144: Mat_Product *product = mat->product;
1145: PetscBool match = PETSC_FALSE;
1146: PetscBool usecpu = PETSC_FALSE;
1148: MatCheckProduct(mat,1);
1149: if (!product->A->boundtocpu && !product->B->boundtocpu) {
1150: PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);
1151: }
1152: if (match) { /* we can always fall back to the CPU if requested */
1153: switch (product->type) {
1154: case MATPRODUCT_AB:
1155: if (product->api_user) {
1156: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
1157: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
1158: PetscOptionsEnd();
1159: } else {
1160: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
1161: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
1162: PetscOptionsEnd();
1163: }
1164: break;
1165: case MATPRODUCT_AtB:
1166: if (product->api_user) {
1167: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
1168: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
1169: PetscOptionsEnd();
1170: } else {
1171: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
1172: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
1173: PetscOptionsEnd();
1174: }
1175: break;
1176: case MATPRODUCT_PtAP:
1177: if (product->api_user) {
1178: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
1179: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
1180: PetscOptionsEnd();
1181: } else {
1182: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
1183: PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
1184: PetscOptionsEnd();
1185: }
1186: break;
1187: default:
1188: break;
1189: }
1190: match = (PetscBool)!usecpu;
1191: }
1192: if (match) {
1193: switch (product->type) {
1194: case MATPRODUCT_AB:
1195: case MATPRODUCT_AtB:
1196: case MATPRODUCT_PtAP:
1197: mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1198: break;
1199: default:
1200: break;
1201: }
1202: }
1203: /* fallback to MPIAIJ ops */
1204: if (!mat->ops->productsymbolic) {
1205: MatProductSetFromOptions_MPIAIJ(mat);
1206: }
1207: return 0;
1208: }
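/* Illustrative usage sketch (added by the editor, not part of the original source): the device product
   paths above are reached through the usual user-level calls on assembled MPIAIJKOKKOS matrices A, B, P,
   and can be redirected to the CPU with the options listed above (e.g. -matmatmult_backend_cpu).

     Mat C,D;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);   // MATPRODUCT_AB
     MatPtAP(A,P,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&D);      // MATPRODUCT_PtAP
     MatDestroy(&C);
     MatDestroy(&D);
*/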
1210: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1211: {
1212: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data;
1213: Mat_MPIAIJKokkos *mpikok;
1215: MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);
1216: mat->preallocated = PETSC_TRUE;
1217: MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);
1218: MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);
1219: MatZeroEntries(mat);
1220: mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1221: delete mpikok;
1222: mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1223: return 0;
1224: }
1226: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode)
1227: {
1228: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
1229: Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1230: Mat A = mpiaij->A,B = mpiaij->B;
1231: PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
1232: MatScalarKokkosView Aa,Ba;
1233: MatScalarKokkosView v1;
1234: MatScalarKokkosView& vsend = mpikok->sendbuf_d;
1235: const MatScalarKokkosView& v2 = mpikok->recvbuf_d;
1236: const PetscCountKokkosView& Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d;
1237: const PetscCountKokkosView& Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d;
1238: const PetscCountKokkosView& Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d;
1239: const PetscCountKokkosView& Cperm1 = mpikok->Cperm1_d;
1240: PetscMemType memtype;
1242: PetscGetMemType(v,&memtype); /* Return PETSC_MEMTYPE_HOST when v is NULL */
1243: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, copy it to the device (if any) */
1244: v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n));
1245: } else {
1246: v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */
1247: }
1249: if (imode == INSERT_VALUES) {
1250: MatSeqAIJGetKokkosViewWrite(A,&Aa); /* write matrix values */
1251: MatSeqAIJGetKokkosViewWrite(B,&Ba);
1252: Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1253: Kokkos::deep_copy(Ba,0.0);
1254: } else {
1255: MatSeqAIJGetKokkosView(A,&Aa); /* read & write matrix values */
1256: MatSeqAIJGetKokkosView(B,&Ba);
1257: }
1259: /* Pack entries to be sent to remote */
1260: Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));});
1262: /* Send remote entries to their owner and overlap the communication with local computation */
1263: PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE);
1264: /* Add local entries to A and B */
1265: Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));});
1266: Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));});
1267: PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE);
1269: /* Add received remote entries to A and B */
1270: Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));});
1271: Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));});
1273: if (imode == INSERT_VALUES) {
1274: MatSeqAIJRestoreKokkosViewWrite(A,&Aa); /* Increase A & B's state etc. */
1275: MatSeqAIJRestoreKokkosViewWrite(B,&Ba);
1276: } else {
1277: MatSeqAIJRestoreKokkosView(A,&Aa);
1278: MatSeqAIJRestoreKokkosView(B,&Ba);
1279: }
1280: return 0;
1281: }
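/*
   Usage sketch for the COO assembly path implemented by the two routines above (illustrative; A is assumed
   to be a MATMPIAIJKOKKOS matrix with its layout set, and i[], j[], v[] are made-up values):

     PetscInt    i[] = {0, 0, 1};        // global row indices of the COO entries
     PetscInt    j[] = {0, 2, 1};        // global column indices
     PetscScalar v[] = {1.0, 2.0, 3.0};  // values; may reside in host or device (Kokkos) memory

     MatSetPreallocationCOO(A, 3, i, j);   // dispatches to MatSetPreallocationCOO_MPIAIJKokkos()
     MatSetValuesCOO(A, v, ADD_VALUES);    // dispatches to MatSetValuesCOO_MPIAIJKokkos()

   With ADD_VALUES, repeated (i,j) pairs are summed into the existing entry; with INSERT_VALUES the matrix
   values are first zeroed (the deep_copy(...,0.0) above) and replicated entries in v[] are still summed.
*/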
1283: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1284: {
1285: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
1287: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
1288: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
1289: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C", NULL);
1290: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);
1291: delete (Mat_MPIAIJKokkos*)mpiaij->spptr;
1292: MatDestroy_MPIAIJ(A);
1293: return 0;
1294: }
1296: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1297: {
1298: Mat B;
1299: Mat_MPIAIJ *a;
1301: if (reuse == MAT_INITIAL_MATRIX) {
1302: MatDuplicate(A,MAT_COPY_VALUES,newmat);
1303: } else if (reuse == MAT_REUSE_MATRIX) {
1304: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
1305: }
1306: B = *newmat;
1308: B->boundtocpu = PETSC_FALSE;
1309: PetscFree(B->defaultvectype);
1310: PetscStrallocpy(VECKOKKOS,&B->defaultvectype);
1311: PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);
1313: a = static_cast<Mat_MPIAIJ*>(A->data);
1314: if (a->A) MatSetType(a->A,MATSEQAIJKOKKOS);
1315: if (a->B) MatSetType(a->B,MATSEQAIJKOKKOS);
1316: if (a->lvec) VecSetType(a->lvec,VECSEQKOKKOS);
1318: B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos;
1319: B->ops->mult = MatMult_MPIAIJKokkos;
1320: B->ops->multadd = MatMultAdd_MPIAIJKokkos;
1321: B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos;
1322: B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1323: B->ops->destroy = MatDestroy_MPIAIJKokkos;
1325: PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);
1326: PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1327: PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);
1328: PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);
1329: return 0;
1330: }
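/*
   Conversion sketch (illustrative): an existing MATMPIAIJ matrix A can be switched to this type with

     Mat Akok;
     MatConvert(A, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, &Akok);   // duplicate into a new Kokkos matrix
     MatConvert(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);      // or convert A in place

   both of which end up in MatConvert_MPIAIJ_MPIAIJKokkos() above.
*/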
1331: /*MC
1332: MATMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
1334: A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types
1336: Options Database Keys:
1337: . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
1339: Level: beginner
1341: .seealso: MatCreateAIJKokkos(), MATSEQAIJKOKKOS
1342: M*/
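/*
   Usage sketch (illustrative; comm, A, m, n, M, N and the preallocation numbers are placeholders): the type
   can be selected from the options database with -mat_type aijkokkos together with MatSetFromOptions(), or set directly:

     MatCreate(comm, &A);
     MatSetSizes(A, m, n, M, N);
     MatSetType(A, MATAIJKOKKOS);                      // resolves to MATMPIAIJKOKKOS on more than one MPI rank
     MatSetFromOptions(A);
     MatSeqAIJSetPreallocation(A, 5, NULL);            // only one of these two takes effect,
     MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL);   // depending on the communicator size
*/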
1343: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1344: {
1345: PetscKokkosInitializeCheck();
1346: MatCreate_MPIAIJ(A);
1347: MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);
1348: return 0;
1349: }
1351: /*@C
1352: MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1353: (the default parallel PETSc format). This matrix will ultimately be pushed down
1354: to Kokkos for calculations. For good matrix
1355: assembly performance the user should preallocate the matrix storage by setting
1356: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
1357: performance during matrix assembly can be increased by more than a factor of 50.
1359: Collective
1361: Input Parameters:
1362: + comm - MPI communicator
1363: . m,n - number of local rows and columns (or PETSC_DECIDE)
1364: . M,N - number of global rows and columns (or PETSC_DETERMINE)
1365: . d_nz - number of nonzeros per row in the diagonal block (same for all rows)
1366: . d_nnz - array containing the number of nonzeros per row in the diagonal block (possibly different for each row), or NULL
1367: - o_nz,o_nnz - the corresponding preallocation values for the off-diagonal block
1369: Output Parameter:
1370: . A - the matrix
1372: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1373: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1374: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1376: Notes:
1377: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
1379: The AIJ format (also called the Yale sparse matrix format or
1380: compressed row storage) is fully compatible with standard Fortran 77
1381: storage. That is, the stored row and column indices can begin at
1382: either one (as in Fortran) or zero. See the users' manual for details.
1384: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1385: Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
1386: allocation. For large problems you MUST preallocate memory or you
1387: will get TERRIBLE performance, see the users' manual chapter on matrices.
1389: By default, this format uses inodes (identical nodes) when possible, to
1390: improve numerical efficiency of matrix-vector products and solves. We
1391: search for consecutive rows with the same nonzero structure, thereby
1392: reusing matrix information to achieve increased efficiency.
1394: Level: intermediate
1396: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJKOKKOS, MATAIJKOKKOS
1397: @*/
1398: PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
1399: {
1400: PetscMPIInt size;
1402: MatCreate(comm,A);
1403: MatSetSizes(*A,m,n,M,N);
1404: MPI_Comm_size(comm,&size);
1405: if (size > 1) {
1406: MatSetType(*A,MATMPIAIJKOKKOS);
1407: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
1408: } else {
1409: MatSetType(*A,MATSEQAIJKOKKOS);
1410: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
1411: }
1412: return 0;
1413: }
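/*
   Calling sketch for the routine above (illustrative sizes): a 100x100 matrix with PETSC_DECIDE local sizes,
   at most 5 nonzeros per row in the diagonal block and 2 in the off-diagonal block:

     Mat A;
     MatCreateAIJKokkos(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A);
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual ...
     MatDestroy(&A);
*/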
1415: // Get a GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat.
1416: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1417: {
1418: PetscMPIInt size,rank;
1419: MPI_Comm comm;
1420: PetscSplitCSRDataStructure d_mat=NULL;
1422: PetscObjectGetComm((PetscObject)A,&comm);
1423: MPI_Comm_size(comm,&size);
1424: MPI_Comm_rank(comm,&rank);
1425: if (size == 1) {
1426: MatSeqAIJKokkosGetDeviceMat(A,&d_mat);
1427: MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1428: } else {
1429: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
1430: MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);
1431: MatSeqAIJKokkosModifyDevice(aij->A);
1432: MatSeqAIJKokkosModifyDevice(aij->B);
1434: }
1435: // act like MatSetValues, since MatSetValues is not called on the host
1436: if (A->assembled) {
1437: if (A->was_assembled) {
1438: PetscInfo(A,"Matrix was already assembled more than once\n");
1439: }
1440: A->was_assembled = PETSC_TRUE; // normally done lazily during MatAssembly, which we no longer call; it is also done in the AIJ AssemblyEnd - is it needed here?
1441: } else {
1442: PetscInfo(A,"Warning: matrix is not assembled (assembled=%d)\n",(int)A->assembled);
1443: }
1444: if (!d_mat) {
1445: struct _n_SplitCSRMat h_mat; /* host container */
1446: Mat_SeqAIJKokkos *aijkokA;
1447: Mat_SeqAIJ *jaca;
1448: PetscInt n = A->rmap->n, nnz;
1449: Mat Amat;
1450: PetscInt *colmap;
1452: /* create and copy h_mat */
1453: h_mat.M = A->cmap->N; // used for debug builds
1454: PetscInfo(A,"Create device matrix in Kokkos\n");
1455: if (size == 1) {
1456: Amat = A;
1457: jaca = (Mat_SeqAIJ*)A->data;
1458: h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1459: h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1460: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1461: h_mat.offdiag.a = NULL;
1462: aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1463: } else {
1464: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
1465: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
1466: PetscInt ii;
1467: Mat_SeqAIJKokkos *aijkokB;
1469: Amat = aij->A;
1470: aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1471: aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1472: jaca = (Mat_SeqAIJ*)aij->A->data;
1475: aij->donotstash = PETSC_TRUE;
1476: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1477: jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1478: PetscCalloc1(A->cmap->N,&colmap);
1479: PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));
1480: for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1481: // allocate B copy data
1482: h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1483: h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1484: nnz = jacb->i[n];
1485: if (jacb->compressedrow.use) {
1486: const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
1487: aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1488: Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k);
1489: h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1490: } else {
1491: h_mat.offdiag.i = aijkokB->i_device_data();
1492: }
1493: h_mat.offdiag.j = aijkokB->j_device_data();
1494: h_mat.offdiag.a = aijkokB->a_device_data();
1495: {
1496: Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
1497: aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1498: Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k);
1499: h_mat.colmap = aijkokB->colmap_d.data();
1500: PetscFree(colmap);
1501: }
1502: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1503: h_mat.offdiag.n = n;
1504: }
1505: // allocate A copy data
1506: nnz = jaca->i[n];
1507: h_mat.diag.n = n;
1508: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1509: MPI_Comm_rank(comm,&h_mat.rank);
1510: if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"compressed-row diagonal block is not supported here"); /* reconstructed guard; the original error check was elided in this listing */
1511: else {
1512: h_mat.diag.i = aijkokA->i_device_data();
1513: }
1514: h_mat.diag.j = aijkokA->j_device_data();
1515: h_mat.diag.a = aijkokA->a_device_data();
1516: // copy pointers and metadata to the device
1517: MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);
1518: MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);
1519: PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);
1520: }
1521: *B = d_mat; // return it, set it in Mat, and set it up
1522: A->assembled = PETSC_FALSE; // ready for writing with MatSetValues; this is done lazily in the normal MatSetValues
1523: return 0;
1524: }
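/*
   Usage sketch (illustrative; nrows_local stands for A->rmap->n and is an assumption of this example): the
   returned d_mat points to device memory, so its CSR arrays can be addressed directly inside a device kernel:

     PetscSplitCSRDataStructure d_mat;
     MatKokkosGetDeviceMatWrite(A, &d_mat);
     Kokkos::parallel_for("scale_diag", nrows_local, KOKKOS_LAMBDA(const PetscInt row) {
       for (PetscInt k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= 2.0; // modify values on the device
     });
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);   // A->assembled was reset above, so reassemble before using A
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);

   For inserting individual entries through this structure, PETSc also provides the device-callable
   MatSetValuesDevice() (see petscaijdevice.h).
*/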
1526: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1527: {
1528: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1530: if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1531: else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1532: else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1533: else *mask = "PETSC_OFFLOAD_BOTH";
1534: return 0;
1535: }
1537: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1538: {
1539: PetscMPIInt size;
1540: Mat Ad,Ao;
1541: const char *amask,*bmask;
1543: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1545: if (size == 1) {
1546: MatSeqAIJKokkosGetOffloadMask(A,&amask);
1547: PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);
1548: } else {
1549: Ad = ((Mat_MPIAIJ*)A->data)->A;
1550: Ao = ((Mat_MPIAIJ*)A->data)->B;
1551: MatSeqAIJKokkosGetOffloadMask(Ad,&amask);
1552: MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);
1553: PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);
1554: }
1555: return 0;
1556: }
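/*
   Debugging sketch (illustrative): with a Mat of type aijkokkos in scope,

     MatAIJKokkosPrintOffloadMask(A);   // e.g. prints "Diag : Off-diag = PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_BOTH"

   reports where the current copy of the values lives, based on the dual-view sync flags queried in
   MatSeqAIJKokkosGetOffloadMask() above. Both routines are PETSC_INTERN and therefore only callable from within PETSc itself.
*/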