Actual source code: vscatcusp.c

petsc-3.4.5 2014-06-29
#include <petsc-private/isimpl.h>
#include <petsc-private/vecimpl.h>             /*I "petscvec.h" I*/

/*@
   VecScatterInitializeForGPU - Initializes a generalized scatter from one vector to
 another for GPU-based computation.  Effectively, this function creates all the
 necessary indexing buffers and work vectors needed to move only those data points
 in a vector which need to be communicated across ranks.  This setup is done the
 first time the function is called.  Ideally, subsequent calls would launch a kernel,
 VecCUSPCopySomeToContiguousBufferGPU_Public, to pack the scattered data into a
 contiguous buffer on the GPU; that kernel launch is currently disabled (see the note
 in the function body).  Currently, this is only used in the context of the parallel
 SpMV call in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu) or MatMult_MPIAIJCUSPARSE
 (in mpi/mpicusparse/mpiaijcusparse.cu).  This function is executed before the call to
 MatMult, which enables the memory transfers to be overlapped with the MatMult SpMV
 kernel call (see the usage sketch after this function).

   Input Parameters:
+  inctx - scatter context generated by VecScatterCreate()
.  x - the vector from which we scatter
-  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
    SCATTER_FORWARD or SCATTER_REVERSE

  Level: intermediate

.seealso: VecScatterCreate(), VecScatterEnd(), VecScatterFinalizeForGPU()
@*/
PetscErrorCode  VecScatterInitializeForGPU(VecScatter inctx,Vec x,ScatterMode mode)
{
  VecScatter_MPI_General *to,*from;
  PetscScalar            *xv;
  PetscErrorCode         ierr;
  PetscInt               i,*indices,*sstartsSends,*sstartsRecvs,nrecvs,nsends,bs;

  PetscFunctionBegin;
  if (mode & SCATTER_REVERSE) {
    to   = (VecScatter_MPI_General*)inctx->fromdata;
    from = (VecScatter_MPI_General*)inctx->todata;
  } else {
    to   = (VecScatter_MPI_General*)inctx->todata;
    from = (VecScatter_MPI_General*)inctx->fromdata;
  }
  bs           = to->bs;
  nrecvs       = from->n;
  nsends       = to->n;
  indices      = to->indices;
  sstartsSends = to->starts;
  sstartsRecvs = from->starts;
  if (x->valid_GPU_array != PETSC_CUSP_UNALLOCATED && (nsends>0 || nrecvs>0)) {
    if (!inctx->spptr) {
      PetscInt k,*tindicesSends,*sindicesSends,*tindicesRecvs,*sindicesRecvs;
      PetscInt ns = sstartsSends[nsends],nr = sstartsRecvs[nrecvs];
      /* Here we create indices for both the senders and receivers. */
      ierr = PetscMalloc(ns*sizeof(PetscInt),&tindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(nr*sizeof(PetscInt),&tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMemcpy(tindicesSends,indices,ns*sizeof(PetscInt));CHKERRQ(ierr);
      ierr = PetscMemcpy(tindicesRecvs,from->indices,nr*sizeof(PetscInt));CHKERRQ(ierr);

      ierr = PetscSortRemoveDupsInt(&ns,tindicesSends);CHKERRQ(ierr);
      ierr = PetscSortRemoveDupsInt(&nr,tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMalloc(bs*ns*sizeof(PetscInt),&sindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(from->bs*nr*sizeof(PetscInt),&sindicesRecvs);CHKERRQ(ierr);

      /* sender indices: expand each block start into bs consecutive scalar indices,
         e.g. tindicesSends = {3,7} with bs = 2 gives sindicesSends = {3,4,7,8}
         (see the standalone sketch at the end of this file) */
      for (i=0; i<ns; i++) {
        for (k=0; k<bs; k++) sindicesSends[i*bs+k] = tindicesSends[i]+k;
      }
      ierr = PetscFree(tindicesSends);CHKERRQ(ierr);

      /* receiver indices, expanded the same way with the receive-side block size */
      for (i=0; i<nr; i++) {
        for (k=0; k<from->bs; k++) sindicesRecvs[i*from->bs+k] = tindicesRecvs[i]+k;
      }
      ierr = PetscFree(tindicesRecvs);CHKERRQ(ierr);

      /* create GPU indices, work vectors, ... */
      ierr = PetscCUSPIndicesCreate(ns*bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);CHKERRQ(ierr);
      ierr = PetscFree(sindicesSends);CHKERRQ(ierr);
      ierr = PetscFree(sindicesRecvs);CHKERRQ(ierr);
    }
    /*
       Ideally the copy kernel below would be launched here: it packs the scattered data
       into a contiguous buffer on the GPU, and that buffer is then messaged after MatMult
       is called.  Although this approach reduces the amount of data transferred between
       GPU and CPU, gather kernels do not perform well on the device, so this version could
       not be made to run efficiently.  Instead, the smallest contiguous chunk of the
       vector containing all the elements that need to be MPI-messaged is transferred from
       GPU to CPU.  The code is left here in case a better gather kernel for the GPU is
       found.
    */
#if 0
    VecCUSPCopySomeToContiguousBufferGPU_Public(x,(PetscCUSPIndices)inctx->spptr);
#endif
  } else {
    /* force the vector data down to the host so that the CPU-side scatter can proceed */
    ierr = VecGetArrayRead(x,(const PetscScalar**)&xv);CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(x,(const PetscScalar**)&xv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
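
/*
   To make the intended call sequence concrete, below is a minimal sketch of how a
   multiply routine such as MatMult_MPIAIJCUSP drives the two routines in this file.
   It is an illustration only: MatMult_MPIAIJCUSP_Sketch is a hypothetical name, and
   the Mat_MPIAIJ members used (Mvctx, lvec, A, B) follow the usual MPIAIJ layout
   rather than anything defined here.
*/
#if 0
#include <../src/mat/impls/aij/mpi/mpiaij.h>    /* private header defining Mat_MPIAIJ */

static PetscErrorCode MatMult_MPIAIJCUSP_Sketch(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* first call builds the GPU index buffers; later calls reuse them */
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* the diagonal-block SpMV overlaps with the messaging started above */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* the off-diagonal block consumes the received ghost values */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  /* the temporary GPU messaging buffers are invalid after this */
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif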
/*@
   VecScatterFinalizeForGPU - Finalizes a generalized scatter from one vector to
 another for GPU-based computation.  Effectively, this function resets the temporary
 buffer flags.  Currently, this is only used in the context of the parallel SpMV call
 in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu) or MatMult_MPIAIJCUSPARSE
 (in mpi/mpicusparse/mpiaijcusparse.cu).  Once the MatMultAdd is finished,
 the GPU temporary buffers used for messaging are no longer valid.

   Input Parameter:
.  inctx - scatter context generated by VecScatterCreate()

  Level: intermediate

.seealso: VecScatterInitializeForGPU(), VecScatterCreate(), VecScatterEnd()
@*/
PetscErrorCode  VecScatterFinalizeForGPU(VecScatter inctx)
{
  PetscFunctionBegin;
  if (inctx->spptr) {
    PetscErrorCode ierr;
    ierr = VecCUSPResetIndexBuffersFlagsGPU_Public((PetscCUSPIndices)inctx->spptr);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
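
/*
   A standalone illustration (with made-up index values, not part of PETSc) of the
   index preparation performed in VecScatterInitializeForGPU: sort the raw indices,
   remove duplicates, then expand each block start into bs consecutive scalar indices.
*/
#if 0
#include <petscsys.h>

int main(int argc,char **argv)
{
  PetscInt       tindices[3] = {7,3,7};   /* raw block starts, including a duplicate */
  PetscInt       ns = 3,bs = 2,i,k,*sindices;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,0,0);if (ierr) return ierr;
  ierr = PetscSortRemoveDupsInt(&ns,tindices);CHKERRQ(ierr);    /* -> {3,7}, ns = 2 */
  ierr = PetscMalloc(bs*ns*sizeof(PetscInt),&sindices);CHKERRQ(ierr);
  for (i=0; i<ns; i++) {
    for (k=0; k<bs; k++) sindices[i*bs+k] = tindices[i]+k;      /* -> {3,4,7,8} */
  }
  for (i=0; i<bs*ns; i++) {
    ierr = PetscPrintf(PETSC_COMM_SELF,"%D ",sindices[i]);CHKERRQ(ierr);
  }
  ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
  ierr = PetscFree(sindices);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}
#endif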