petsc-3.10.5 2019-03-28
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12:  #include <petsccuda.h>
 13: #include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
 14:  #include <../src/vec/vec/impls/dvecimpl.h>
 15:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 17: static PetscErrorCode PetscCUBLASDestroyHandle();

 19: /*
 20:    Implementation for obtaining read-write access to the cuBLAS handle.
 21:    Required to properly deal with repeated calls of PetscInitizalize()/PetscFinalize().
 22:  */
 23: static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
 24: {
 25:   static cublasHandle_t cublasv2handle = NULL;
 26:   cublasStatus_t        cberr;
 27:   PetscErrorCode        ierr;

 30:   if (!cublasv2handle) {
 31:     cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
 32:     /* Make sure that the handle will be destroyed properly */
 33:     PetscRegisterFinalize(PetscCUBLASDestroyHandle);
 34:   }
 35:   *handle = &cublasv2handle;
 36:   return(0);
 37: }

 39: /*
 40:    Singleton for obtaining a handle to cuBLAS.
 41:    The handle is required for calls to routines in cuBLAS.
 42:  */
 43: PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
 44: {
 45:   cublasHandle_t *p_cublasv2handle;

 49:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 50:   *handle = *p_cublasv2handle;
 51:   return(0);
 52: }


 55: /*
 56:    Destroys the CUBLAS handle.
 57:    This function is intended and registered for PetscFinalize - do not call manually!
 58:  */
 59: PetscErrorCode PetscCUBLASDestroyHandle()
 60: {
 61:   cublasHandle_t *p_cublasv2handle;
 62:   cublasStatus_t cberr;

 66:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 67:   cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
 68:   *p_cublasv2handle = NULL;  /* Ensures proper reinitialization */
 69:   return(0);
 70: }

 72: /*
 73:     Allocates space for the vector array on the Host if it does not exist.
 74:     Does NOT change the PetscCUDAFlag for the vector
 75:     Does NOT zero the CUDA array
 76:  */
 77: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 78: {
 80:   PetscScalar    *array;
 81:   Vec_Seq        *s = (Vec_Seq*)v->data;
 82:   PetscInt       n = v->map->n;

 85:   if (!s) {
 86:     PetscNewLog((PetscObject)v,&s);
 87:     v->data = s;
 88:   }
 89:   if (!s->array) {
 90:     PetscMalloc1(n,&array);
 91:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 92:     s->array           = array;
 93:     s->array_allocated = array;
 94:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 95:       v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 96:     }
 97:   }
 98:   return(0);
 99: }

101: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
102: {
103:   PetscScalar       *ya;
104:   const PetscScalar *xa;
105:   PetscErrorCode    ierr;

108:   VecCUDAAllocateCheckHost(xin);
109:   VecCUDAAllocateCheckHost(yin);
110:   if (xin != yin) {
111:     VecGetArrayRead(xin,&xa);
112:     VecGetArray(yin,&ya);
113:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
114:     VecRestoreArrayRead(xin,&xa);
115:     VecRestoreArray(yin,&ya);
116:   }
117:   return(0);
118: }

120: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
121: {
123:   PetscInt       n = xin->map->n,i;
124:   PetscScalar    *xx;

127:   VecGetArray(xin,&xx);
128:   for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
129:   VecRestoreArray(xin,&xx);
130:   return(0);
131: }

133: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
134: {
135:   Vec_Seq        *vs = (Vec_Seq*)v->data;

139:   PetscObjectSAWsViewOff(v);
140: #if defined(PETSC_USE_LOG)
141:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
142: #endif
143:   if (vs) {
144:     if (vs->array_allocated) { PetscFree(vs->array_allocated); }
145:     PetscFree(vs);
146:   }
147:   return(0);
148: }

150: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
151: {
152:   Vec_Seq *v = (Vec_Seq*)vin->data;

155:   v->array         = v->unplacedarray;
156:   v->unplacedarray = 0;
157:   return(0);
158: }

160: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
161: {

165:   VecCUDAAllocateCheck(v);
166:   return(0);
167: }

169: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
170: {

174:   VecCUDACopyToGPU(v);
175:   return(0);
176: }

178: /*
179:     VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

181:    Input Parameters:
182:  +  v    - the vector
183:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
184:  -  mode - vec scatter mode used in VecScatterBegin/End
185: */
186: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
187: {

191:   VecCUDACopyToGPUSome(v,ci,mode);
192:   return(0);
193: }

195: /*
196:   VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

198:   Input Parameters:
199:  +  v    - the vector
200:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
201:  -  mode - vec scatter mode used in VecScatterBegin/End
202: */
203: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
204: {

208:   VecCUDACopyFromGPUSome(v,ci,mode);
209:   return(0);
210: }

212: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
213: {

217:   VecSetRandom_SeqCUDA_Private(xin,r);
218:   xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
219:   return(0);
220: }

222: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
223: {

227:   VecCUDACopyFromGPU(vin);
228:   VecResetArray_SeqCUDA_Private(vin);
229:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
230:   return(0);
231: }

233: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
234: {

238:   VecCUDACopyFromGPU(vin);
239:   VecPlaceArray_Seq(vin,a);
240:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
241:   return(0);
242: }

244: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
245: {

249:   VecCUDACopyFromGPU(vin);
250:   VecReplaceArray_Seq(vin,a);
251:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
252:   return(0);
253: }

255: /*@
256:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

258:  Collective on MPI_Comm

260:  Input Parameter:
261:  +  comm - the communicator, should be PETSC_COMM_SELF
262:  -  n - the vector length

264:  Output Parameter:
265:  .  v - the vector

267:  Notes:
268:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
269:  same type as an existing vector.

271:  Level: intermediate

273:  Concepts: vectors^creating sequential

275:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
276:  @*/
277: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
278: {

282:   VecCreate(comm,v);
283:   VecSetSizes(*v,n,n);
284:   VecSetType(*v,VECSEQCUDA);
285:   return(0);
286: }

288: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
289: {

293:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
294:   PetscLayoutReference(win->map,&(*V)->map);
295:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
296:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
297:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
298:   return(0);
299: }

301: PetscErrorCode VecCreate_SeqCUDA(Vec V)
302: {

306:   PetscLayoutSetUp(V->map);
307:   VecCUDAAllocateCheck(V);
308:   V->valid_GPU_array = PETSC_OFFLOAD_GPU;
309:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
310:   VecSet(V,0.0);
311:   return(0);
312: }

314: /*@C
315:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
316:    where the user provides the array space to store the vector values. The array
317:    provided must be a GPU array.

319:    Collective on MPI_Comm

321:    Input Parameter:
322: +  comm - the communicator, should be PETSC_COMM_SELF
323: .  bs - the block size
324: .  n - the vector length
325: -  array - GPU memory where the vector elements are to be stored.

327:    Output Parameter:
328: .  V - the vector

330:    Notes:
331:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
332:    same type as an existing vector.

334:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
335:    at a later stage to SET the array for storing the vector values.

337:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
338:    The user should not free the array until the vector is destroyed.

340:    Level: intermediate

342:    Concepts: vectors^creating with array

344: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
345:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
346:           VecCreateMPIWithArray()
347: @*/
348: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
349: {
351:   PetscMPIInt    size;

354:   VecCreate(comm,V);
355:   VecSetSizes(*V,n,n);
356:   VecSetBlockSize(*V,bs);
357:   MPI_Comm_size(comm,&size);
358:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
359:   VecCreate_SeqCUDA_Private(*V,array);
360:   return(0);
361: }

363: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
364: {
366:   cudaError_t    err;
367:   Vec_CUDA       *veccuda;
368:   PetscMPIInt    size;

371:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
372:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
373:   VecCreate_Seq_Private(V,0);
374:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);

376:   V->ops->dot                    = VecDot_SeqCUDA;
377:   V->ops->norm                   = VecNorm_SeqCUDA;
378:   V->ops->tdot                   = VecTDot_SeqCUDA;
379:   V->ops->scale                  = VecScale_SeqCUDA;
380:   V->ops->copy                   = VecCopy_SeqCUDA;
381:   V->ops->set                    = VecSet_SeqCUDA;
382:   V->ops->swap                   = VecSwap_SeqCUDA;
383:   V->ops->axpy                   = VecAXPY_SeqCUDA;
384:   V->ops->axpby                  = VecAXPBY_SeqCUDA;
385:   V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
386:   V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
387:   V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
388:   V->ops->setrandom              = VecSetRandom_SeqCUDA;
389:   V->ops->dot_local              = VecDot_SeqCUDA;
390:   V->ops->tdot_local             = VecTDot_SeqCUDA;
391:   V->ops->norm_local             = VecNorm_SeqCUDA;
392:   V->ops->mdot_local             = VecMDot_SeqCUDA;
393:   V->ops->maxpy                  = VecMAXPY_SeqCUDA;
394:   V->ops->mdot                   = VecMDot_SeqCUDA;
395:   V->ops->aypx                   = VecAYPX_SeqCUDA;
396:   V->ops->waxpy                  = VecWAXPY_SeqCUDA;
397:   V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
398:   V->ops->placearray             = VecPlaceArray_SeqCUDA;
399:   V->ops->replacearray           = VecReplaceArray_SeqCUDA;
400:   V->ops->resetarray             = VecResetArray_SeqCUDA;
401:   V->ops->destroy                = VecDestroy_SeqCUDA;
402:   V->ops->duplicate              = VecDuplicate_SeqCUDA;
403:   V->ops->conjugate              = VecConjugate_SeqCUDA;
404:   V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
405:   V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
406:   V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
407:   V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

409:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
410:   if (array) {
411:     if (!V->spptr) {
412:       PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
413:       veccuda = (Vec_CUDA*)V->spptr;
414:       err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
415:       veccuda->GPUarray_allocated = 0;
416:       veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
417:       V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
418:     }
419:     veccuda = (Vec_CUDA*)V->spptr;
420:     veccuda->GPUarray = (PetscScalar*)array;
421:   }

423:   return(0);
424: }