#include <petsc-private/isimpl.h>
#include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
/*@
   VecScatterInitializeForGPU - Initializes a generalized scatter from one vector to
   another for GPU-based computation. Effectively, this function creates all the
   necessary indexing buffers and work vectors needed to move only those entries of a
   vector that must be communicated across ranks. This setup is done the first time
   this function is called. Thereafter, this function launches a kernel,
   VecCUSPCopySomeToContiguousBufferGPU_Public(), which moves the scattered data into a
   contiguous buffer on the GPU. Currently, this is only used in the context of the
   parallel SpMV call in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu) or
   MatMult_MPIAIJCUSPARSE (in mpi/mpicusparse/mpiaijcusparse.cu). This function is
   executed before the call to MatMult, which enables the memory transfers to be
   overlapped with the MatMult SpMV kernel call.
   Input Parameters:
+  inctx - scatter context generated by VecScatterCreate()
.  x     - the vector from which we scatter
-  mode  - the scattering mode, usually SCATTER_FORWARD. The available modes are:
           SCATTER_FORWARD or SCATTER_REVERSE

   Level: intermediate
.seealso: VecScatterFinalizeForGPU(), VecScatterCreate(), VecScatterEnd()
@*/
PetscErrorCode VecScatterInitializeForGPU(VecScatter inctx,Vec x,ScatterMode mode)
{
  VecScatter_MPI_General *to,*from;
  const PetscScalar      *xv;
  PetscErrorCode         ierr;
  PetscInt               i,*indices,*sstartsSends,*sstartsRecvs,nrecvs,nsends,bs;

  PetscFunctionBegin;
  if (mode & SCATTER_REVERSE) {
    /* a reverse scatter swaps the roles of the send and receive sides */
    to   = (VecScatter_MPI_General*)inctx->fromdata;
    from = (VecScatter_MPI_General*)inctx->todata;
  } else {
    to   = (VecScatter_MPI_General*)inctx->todata;
    from = (VecScatter_MPI_General*)inctx->fromdata;
  }
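  /*
     Cache the message layout from the scatter context: bs is the block size, n the number
     of messages on each side, indices the local scalar offsets to be sent, and starts[] the
     prefix offsets into indices, so starts[n] is the total number of indices on that side.
  */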
  bs           = to->bs;
  nrecvs       = from->n;
  nsends       = to->n;
  indices      = to->indices;
  sstartsSends = to->starts;
  sstartsRecvs = from->starts;
  if (x->valid_GPU_array != PETSC_CUSP_UNALLOCATED && (nsends>0 || nrecvs>0)) {
    if (!inctx->spptr) {
      PetscInt k,*tindicesSends,*sindicesSends,*tindicesRecvs,*sindicesRecvs;
      PetscInt ns = sstartsSends[nsends],nr = sstartsRecvs[nrecvs];
      /* Here we create indices for both the senders and receivers. */
      ierr = PetscMalloc(ns*sizeof(PetscInt),&tindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(nr*sizeof(PetscInt),&tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMemcpy(tindicesSends,indices,ns*sizeof(PetscInt));CHKERRQ(ierr);
      ierr = PetscMemcpy(tindicesRecvs,from->indices,nr*sizeof(PetscInt));CHKERRQ(ierr);
      ierr = PetscSortRemoveDupsInt(&ns,tindicesSends);CHKERRQ(ierr);
      ierr = PetscSortRemoveDupsInt(&nr,tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMalloc(bs*ns*sizeof(PetscInt),&sindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(from->bs*nr*sizeof(PetscInt),&sindicesRecvs);CHKERRQ(ierr);
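
      /*
         Each retained entry of tindicesSends/tindicesRecvs is the scalar offset of a block
         start; the loops below expand every block into bs consecutive scalar indices. For
         example (hypothetical values), bs = 4 and tindicesSends = {0, 8} expand to
         sindicesSends = {0,1,2,3, 8,9,10,11}.
      */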
      /* sender indices */
      for (i=0; i<ns; i++) {
        for (k=0; k<bs; k++) sindicesSends[i*bs+k] = tindicesSends[i]+k;
      }
      ierr = PetscFree(tindicesSends);CHKERRQ(ierr);
      /* receiver indices */
      for (i=0; i<nr; i++) {
        for (k=0; k<from->bs; k++) sindicesRecvs[i*from->bs+k] = tindicesRecvs[i]+k;
      }
      ierr = PetscFree(tindicesRecvs);CHKERRQ(ierr);
      /* create GPU indices, work vectors, ... */
      ierr = PetscCUSPIndicesCreate(ns*bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);CHKERRQ(ierr);
      ierr = PetscFree(sindicesSends);CHKERRQ(ierr);
      ierr = PetscFree(sindicesRecvs);CHKERRQ(ierr);
    }
    /*
       Ideally the copy kernel would be launched here: it gathers the scattered entries into
       a contiguous GPU buffer, and that buffer is then messaged after MatMult is called.
       This version is disabled because, although it reduces the amount of data transferred
       between GPU and CPU, gather kernels perform poorly on the device and it did not run
       more efficiently in practice. Instead, the smallest contiguous chunk of the vector
       containing all elements needing to be MPI-messaged is copied from GPU to CPU. The
       call is kept here in case a better GPU gather kernel makes it worthwhile.
    */
#if 0
    ierr = VecCUSPCopySomeToContiguousBufferGPU_Public(x,(PetscCUSPIndices)inctx->spptr);CHKERRQ(ierr);
#endif
  } else {
    /* get (and immediately restore, to keep the get/restore calls balanced) the host
       array so that valid CPU data is available for the scatter */
    ierr = VecGetArrayRead(x,&xv);CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(x,&xv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*@
   VecScatterFinalizeForGPU - Finalizes a generalized scatter from one vector to
   another for GPU-based computation. Effectively, this function resets the temporary
   buffer flags. Currently, this is only used in the context of the parallel SpMV call
   in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu) or MatMult_MPIAIJCUSPARSE
   (in mpi/mpicusparse/mpiaijcusparse.cu). Once the MatMultAdd is finished, the GPU
   temporary buffers used for messaging are no longer valid.
   Input Parameter:
.  inctx - scatter context generated by VecScatterCreate()

   Level: intermediate

.seealso: VecScatterInitializeForGPU(), VecScatterCreate()
@*/
PetscErrorCode VecScatterFinalizeForGPU(VecScatter inctx)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (inctx->spptr) {
    ierr = VecCUSPResetIndexBuffersFlagsGPU_Public((PetscCUSPIndices)inctx->spptr);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}