petsc-3.13.6 2020-09-29
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12:  #include <petsc/private/vecimpl.h>
 13:  #include <../src/vec/vec/impls/dvecimpl.h>
 14:  #include <petsc/private/cudavecimpl.h>

 16: /*
 17:     Allocates space for the vector array on the Host if it does not exist.
 18:     Does NOT change the PetscCUDAFlag for the vector
 19:     Does NOT zero the CUDA array
 20:  */
 21: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 22: {
 24:   PetscScalar    *array;
 25:   Vec_Seq        *s = (Vec_Seq*)v->data;
 26:   PetscInt       n = v->map->n;

 29:   if (!s) {
 30:     PetscNewLog((PetscObject)v,&s);
 31:     v->data = s;
 32:   }
 33:   if (!s->array) {
 34:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 35:       PetscMallocSetCUDAHost();
 36:       v->pinned_memory = PETSC_TRUE;
 37:     }
 38:     PetscMalloc1(n,&array);
 39:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 40:     s->array           = array;
 41:     s->array_allocated = array;
 42:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 43:       PetscMallocResetCUDAHost();
 44:     }
 45:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 46:       v->offloadmask = PETSC_OFFLOAD_CPU;
 47:     }
 48:   }
 49:   return(0);
 50: }

 52: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
 53: {
 54:   PetscScalar       *ya;
 55:   const PetscScalar *xa;
 56:   PetscErrorCode    ierr;

 59:   VecCUDAAllocateCheckHost(xin);
 60:   VecCUDAAllocateCheckHost(yin);
 61:   if (xin != yin) {
 62:     VecGetArrayRead(xin,&xa);
 63:     VecGetArray(yin,&ya);
 64:     PetscArraycpy(ya,xa,xin->map->n);
 65:     VecRestoreArrayRead(xin,&xa);
 66:     VecRestoreArray(yin,&ya);
 67:   }
 68:   return(0);
 69: }

 71: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
 72: {
 74:   PetscInt       n = xin->map->n,i;
 75:   PetscScalar    *xx;

 78:   VecGetArray(xin,&xx);
 79:   for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
 80:   VecRestoreArray(xin,&xx);
 81:   return(0);
 82: }

 84: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
 85: {
 86:   Vec_Seq        *vs = (Vec_Seq*)v->data;

 90:   PetscObjectSAWsViewOff(v);
 91: #if defined(PETSC_USE_LOG)
 92:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
 93: #endif
 94:   if (vs) {
 95:     if (vs->array_allocated) {
 96:       if (v->pinned_memory) {
 97:         PetscMallocSetCUDAHost();
 98:       }
 99:       PetscFree(vs->array_allocated);
100:       if (v->pinned_memory) {
101:         PetscMallocResetCUDAHost();
102:         v->pinned_memory = PETSC_FALSE;
103:       }
104:     }
105:     PetscFree(vs);
106:   }
107:   return(0);
108: }

110: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
111: {
112:   Vec_Seq *v = (Vec_Seq*)vin->data;

115:   v->array         = v->unplacedarray;
116:   v->unplacedarray = 0;
117:   return(0);
118: }

120: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
121: {

125:   VecCUDAAllocateCheck(v);
126:   return(0);
127: }

129: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
130: {

134:   VecCUDACopyToGPU(v);
135:   return(0);
136: }

138: /*
139:     VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

141:    Input Parameters:
142:  +  v    - the vector
143:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
144:  -  mode - vec scatter mode used in VecScatterBegin/End
145: */
146: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
147: {

151:   VecCUDACopyToGPUSome(v,ci,mode);
152:   return(0);
153: }

155: /*
156:   VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

158:   Input Parameters:
159:  +  v    - the vector
160:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
161:  -  mode - vec scatter mode used in VecScatterBegin/End
162: */
163: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
164: {

168:   VecCUDACopyFromGPUSome(v,ci,mode);
169:   return(0);
170: }

172: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
173: {

177:   VecSetRandom_SeqCUDA_Private(xin,r);
178:   xin->offloadmask = PETSC_OFFLOAD_CPU;
179:   return(0);
180: }

182: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
183: {

187:   VecCUDACopyFromGPU(vin);
188:   VecResetArray_SeqCUDA_Private(vin);
189:   vin->offloadmask = PETSC_OFFLOAD_CPU;
190:   return(0);
191: }

193: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
194: {

198:   VecCUDACopyFromGPU(vin);
199:   VecPlaceArray_Seq(vin,a);
200:   vin->offloadmask = PETSC_OFFLOAD_CPU;
201:   return(0);
202: }

204: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
205: {

209:   VecCUDACopyFromGPU(vin);
210:   VecReplaceArray_Seq(vin,a);
211:   vin->offloadmask = PETSC_OFFLOAD_CPU;
212:   return(0);
213: }

215: /*@
216:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

218:  Collective

220:  Input Parameter:
221:  +  comm - the communicator, should be PETSC_COMM_SELF
222:  -  n - the vector length

224:  Output Parameter:
225:  .  v - the vector

227:  Notes:
228:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
229:  same type as an existing vector.

231:  Level: intermediate

233:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
234:  @*/
235: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
236: {

240:   VecCreate(comm,v);
241:   VecSetSizes(*v,n,n);
242:   VecSetType(*v,VECSEQCUDA);
243:   return(0);
244: }

246: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
247: {

251:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
252:   PetscLayoutReference(win->map,&(*V)->map);
253:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
254:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
255:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
256:   return(0);
257: }

259: PetscErrorCode VecCreate_SeqCUDA(Vec V)
260: {

264:   PetscLayoutSetUp(V->map);
265:   VecCUDAAllocateCheck(V);
266:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
267:   VecCUDAAllocateCheckHost(V);
268:   VecSet(V,0.0);
269:   VecSet_Seq(V,0.0);
270:   V->offloadmask = PETSC_OFFLOAD_BOTH;
271:   return(0);
272: }

274: /*@C
275:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
276:    where the user provides the array space to store the vector values. The array
277:    provided must be a GPU array.

279:    Collective

281:    Input Parameter:
282: +  comm - the communicator, should be PETSC_COMM_SELF
283: .  bs - the block size
284: .  n - the vector length
285: -  array - GPU memory where the vector elements are to be stored.

287:    Output Parameter:
288: .  V - the vector

290:    Notes:
291:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
292:    same type as an existing vector.

294:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
295:    at a later stage to SET the array for storing the vector values.

297:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
298:    The user should not free the array until the vector is destroyed.

300:    Level: intermediate

302: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
303:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
304:           VecCreateMPIWithArray()
305: @*/
306: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
307: {
309:   PetscMPIInt    size;

312:   VecCreate(comm,V);
313:   VecSetSizes(*V,n,n);
314:   VecSetBlockSize(*V,bs);
315:   MPI_Comm_size(comm,&size);
316:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
317:   VecCreate_SeqCUDA_Private(*V,array);
318:   (*V)->offloadmask = PETSC_OFFLOAD_GPU;
319:   return(0);
320: }

322: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
323: {

327:   VecCUDAAllocateCheckHost(v);
328:   v->offloadmask = PETSC_OFFLOAD_CPU;
329:   *vv = *((PetscScalar**)v->data);
330:   return(0);
331: }

333: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
334: {

338:   V->boundtocpu = pin;
339:   if (pin) {
340:     VecCUDACopyFromGPU(V);
341:     V->offloadmask                 = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
342:     V->ops->dot                    = VecDot_Seq;
343:     V->ops->norm                   = VecNorm_Seq;
344:     V->ops->tdot                   = VecTDot_Seq;
345:     V->ops->scale                  = VecScale_Seq;
346:     V->ops->copy                   = VecCopy_Seq;
347:     V->ops->set                    = VecSet_Seq;
348:     V->ops->swap                   = VecSwap_Seq;
349:     V->ops->axpy                   = VecAXPY_Seq;
350:     V->ops->axpby                  = VecAXPBY_Seq;
351:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
352:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
353:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
354:     V->ops->setrandom              = VecSetRandom_Seq;
355:     V->ops->dot_local              = VecDot_Seq;
356:     V->ops->tdot_local             = VecTDot_Seq;
357:     V->ops->norm_local             = VecNorm_Seq;
358:     V->ops->mdot_local             = VecMDot_Seq;
359:     V->ops->mtdot_local            = VecMTDot_Seq;
360:     V->ops->maxpy                  = VecMAXPY_Seq;
361:     V->ops->mdot                   = VecMDot_Seq;
362:     V->ops->mtdot                  = VecMTDot_Seq;
363:     V->ops->aypx                   = VecAYPX_Seq;
364:     V->ops->waxpy                  = VecWAXPY_Seq;
365:     V->ops->dotnorm2               = NULL;
366:     V->ops->placearray             = VecPlaceArray_Seq;
367:     V->ops->replacearray           = VecReplaceArray_Seq;
368:     V->ops->resetarray             = VecResetArray_Seq;
369:     V->ops->duplicate              = VecDuplicate_Seq;
370:     V->ops->conjugate              = VecConjugate_Seq;
371:     V->ops->getlocalvector         = NULL;
372:     V->ops->restorelocalvector     = NULL;
373:     V->ops->getlocalvectorread     = NULL;
374:     V->ops->restorelocalvectorread = NULL;
375:     V->ops->getarraywrite          = NULL;
376:   } else {
377:     V->ops->dot                    = VecDot_SeqCUDA;
378:     V->ops->norm                   = VecNorm_SeqCUDA;
379:     V->ops->tdot                   = VecTDot_SeqCUDA;
380:     V->ops->scale                  = VecScale_SeqCUDA;
381:     V->ops->copy                   = VecCopy_SeqCUDA;
382:     V->ops->set                    = VecSet_SeqCUDA;
383:     V->ops->swap                   = VecSwap_SeqCUDA;
384:     V->ops->axpy                   = VecAXPY_SeqCUDA;
385:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
386:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
387:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
388:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
389:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
390:     V->ops->dot_local              = VecDot_SeqCUDA;
391:     V->ops->tdot_local             = VecTDot_SeqCUDA;
392:     V->ops->norm_local             = VecNorm_SeqCUDA;
393:     V->ops->mdot_local             = VecMDot_SeqCUDA;
394:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
395:     V->ops->mdot                   = VecMDot_SeqCUDA;
396:     V->ops->aypx                   = VecAYPX_SeqCUDA;
397:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
398:     V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
399:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
400:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
401:     V->ops->resetarray             = VecResetArray_SeqCUDA;
402:     V->ops->destroy                = VecDestroy_SeqCUDA;
403:     V->ops->duplicate              = VecDuplicate_SeqCUDA;
404:     V->ops->conjugate              = VecConjugate_SeqCUDA;
405:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
406:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
407:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
408:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
409:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
410:   }
411:   return(0);
412: }

414: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
415: {
417:   Vec_CUDA       *veccuda;
418:   PetscMPIInt    size;
419:   PetscBool      option_set;

422:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
423:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
424:   VecCreate_Seq_Private(V,0);
425:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
426:   VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
427:   V->ops->bindtocpu = VecBindToCPU_SeqCUDA;

429:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
430:   if (array) {
431:     if (!V->spptr) {
432:       PetscReal pinned_memory_min;
433:       PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
434:       veccuda = (Vec_CUDA*)V->spptr;
435:       veccuda->stream = 0; /* using default stream */
436:       veccuda->GPUarray_allocated = 0;
437:       V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

439:       pinned_memory_min = 0;
440:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
441:          Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
442:       PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
443:       PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
444:       if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
445:       PetscOptionsEnd();
446:     }
447:     veccuda = (Vec_CUDA*)V->spptr;
448:     veccuda->GPUarray = (PetscScalar*)array;
449:   }

451:   return(0);
452: }