petsc-3.14.6 2021-03-30
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12: #include <petsc/private/vecimpl.h>
 13: #include <../src/vec/vec/impls/dvecimpl.h>
 14: #include <petsc/private/cudavecimpl.h>

 16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
 17: {
 20:   if (x) {
 21:     Vec_Seq *h = (Vec_Seq*)v->data;

 23:     *x = h->array;
 24:   }
 25:   if (x_d) {
 26:     Vec_CUDA *d = (Vec_CUDA*)v->spptr;

 28:     *x_d = d ? d->GPUarray : NULL;
 29:   }
 30:   if (flg) *flg = v->offloadmask;
 31:   return(0);
 32: }

 34: /*
 35:     Allocates space for the vector array on the Host if it does not exist.
 36:     Does NOT change the PetscCUDAFlag for the vector
 37:     Does NOT zero the CUDA array
 38:  */
 39: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 40: {
 42:   PetscScalar    *array;
 43:   Vec_Seq        *s = (Vec_Seq*)v->data;
 44:   PetscInt       n = v->map->n;

 47:   if (!s) {
 48:     PetscNewLog((PetscObject)v,&s);
 49:     v->data = s;
 50:   }
 51:   if (!s->array) {
 52:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 53:       PetscMallocSetCUDAHost();
 54:       v->pinned_memory = PETSC_TRUE;
 55:     }
 56:     PetscMalloc1(n,&array);
 57:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 58:     s->array           = array;
 59:     s->array_allocated = array;
 60:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 61:       PetscMallocResetCUDAHost();
 62:     }
 63:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 64:       v->offloadmask = PETSC_OFFLOAD_CPU;
 65:     }
 66:   }
 67:   return(0);
 68: }

 70: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
 71: {
 72:   PetscScalar       *ya;
 73:   const PetscScalar *xa;
 74:   PetscErrorCode    ierr;

 77:   VecCUDAAllocateCheckHost(xin);
 78:   VecCUDAAllocateCheckHost(yin);
 79:   if (xin != yin) {
 80:     VecGetArrayRead(xin,&xa);
 81:     VecGetArray(yin,&ya);
 82:     PetscArraycpy(ya,xa,xin->map->n);
 83:     VecRestoreArrayRead(xin,&xa);
 84:     VecRestoreArray(yin,&ya);
 85:   }
 86:   return(0);
 87: }

 89: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
 90: {
 92:   PetscInt       n = xin->map->n,i;
 93:   PetscScalar    *xx;

 96:   VecGetArray(xin,&xx);
 97:   for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
 98:   VecRestoreArray(xin,&xx);
 99:   return(0);
100: }

102: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
103: {
104:   Vec_Seq        *vs = (Vec_Seq*)v->data;

108:   PetscObjectSAWsViewOff(v);
109: #if defined(PETSC_USE_LOG)
110:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
111: #endif
112:   if (vs) {
113:     if (vs->array_allocated) {
114:       if (v->pinned_memory) {
115:         PetscMallocSetCUDAHost();
116:       }
117:       PetscFree(vs->array_allocated);
118:       if (v->pinned_memory) {
119:         PetscMallocResetCUDAHost();
120:         v->pinned_memory = PETSC_FALSE;
121:       }
122:     }
123:     PetscFree(vs);
124:   }
125:   return(0);
126: }

128: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
129: {
130:   Vec_Seq *v = (Vec_Seq*)vin->data;

133:   v->array         = v->unplacedarray;
134:   v->unplacedarray = 0;
135:   return(0);
136: }

138: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
139: {

143:   VecCUDAAllocateCheck(v);
144:   return(0);
145: }

147: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
148: {

152:   VecCUDACopyToGPU(v);
153:   return(0);
154: }

156: /*
157:     VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

159:    Input Parameters:
160:  +  v    - the vector
161:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
162:  -  mode - vec scatter mode used in VecScatterBegin/End
163: */
164: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
165: {

169:   VecCUDACopyToGPUSome(v,ci,mode);
170:   return(0);
171: }

173: /*
174:   VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

176:   Input Parameters:
177:  +  v    - the vector
178:  .  ci   - the requested indices, this should be created with CUDAIndicesCreate()
179:  -  mode - vec scatter mode used in VecScatterBegin/End
180: */
181: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
182: {

186:   VecCUDACopyFromGPUSome(v,ci,mode);
187:   return(0);
188: }

190: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
191: {

195:   VecSetRandom_SeqCUDA_Private(xin,r);
196:   xin->offloadmask = PETSC_OFFLOAD_CPU;
197:   return(0);
198: }

200: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
201: {

205:   VecCUDACopyFromGPU(vin);
206:   VecResetArray_SeqCUDA_Private(vin);
207:   vin->offloadmask = PETSC_OFFLOAD_CPU;
208:   return(0);
209: }

211: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
212: {

216:   VecCUDACopyFromGPU(vin);
217:   VecPlaceArray_Seq(vin,a);
218:   vin->offloadmask = PETSC_OFFLOAD_CPU;
219:   return(0);
220: }

222: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
223: {
225:   Vec_Seq        *vs = (Vec_Seq*)vin->data;

228:   if (vs->array != vs->array_allocated) {
229:     /* make sure the users array has the latest values */
230:     VecCUDACopyFromGPU(vin);
231:   }
232:   if (vs->array_allocated) {
233:     if (vin->pinned_memory) {
234:       PetscMallocSetCUDAHost();
235:     }
236:     PetscFree(vs->array_allocated);
237:     if (vin->pinned_memory) {
238:       PetscMallocResetCUDAHost();
239:     }
240:   }
241:   vin->pinned_memory = PETSC_FALSE;
242:   vs->array_allocated = vs->array = (PetscScalar*)a;
243:   vin->offloadmask = PETSC_OFFLOAD_CPU;
244:   return(0);
245: }

247: /*@
248:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

250:  Collective

 252:  Input Parameters:
253:  +  comm - the communicator, should be PETSC_COMM_SELF
254:  -  n - the vector length

256:  Output Parameter:
257:  .  v - the vector

259:  Notes:
260:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
261:  same type as an existing vector.

263:  Level: intermediate

265:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
266:  @*/
267: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
268: {

272:   VecCreate(comm,v);
273:   VecSetSizes(*v,n,n);
274:   VecSetType(*v,VECSEQCUDA);
275:   return(0);
276: }

278: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
279: {

283:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
284:   PetscLayoutReference(win->map,&(*V)->map);
285:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
286:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
287:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
288:   return(0);
289: }

291: PetscErrorCode VecCreate_SeqCUDA(Vec V)
292: {

296:   PetscCUDAInitializeCheck();
297:   PetscLayoutSetUp(V->map);
298:   VecCUDAAllocateCheck(V);
299:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
300:   VecCUDAAllocateCheckHost(V);
301:   VecSet(V,0.0);
302:   VecSet_Seq(V,0.0);
303:   V->offloadmask = PETSC_OFFLOAD_BOTH;
304:   return(0);
305: }

307: /*@C
308:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
309:    where the user provides the array space to store the vector values. The array
310:    provided must be a GPU array.

312:    Collective

 314:    Input Parameters:
315: +  comm - the communicator, should be PETSC_COMM_SELF
316: .  bs - the block size
317: .  n - the vector length
318: -  array - GPU memory where the vector elements are to be stored.

320:    Output Parameter:
321: .  V - the vector

323:    Notes:
324:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
325:    same type as an existing vector.

327:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
328:    at a later stage to SET the array for storing the vector values.

330:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
331:    The user should not free the array until the vector is destroyed.

333:    Level: intermediate

335: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
336:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
337:           VecCreateMPIWithArray()
338: @*/
339: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
340: {

344:   PetscCUDAInitializeCheck();
345:   VecCreate(comm,V);
346:   VecSetSizes(*V,n,n);
347:   VecSetBlockSize(*V,bs);
348:   VecCreate_SeqCUDA_Private(*V,array);
349:   return(0);
350: }

352: /*@C
353:    VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
354:    where the user provides the array space to store the vector values.

356:    Collective

 358:    Input Parameters:
359: +  comm - the communicator, should be PETSC_COMM_SELF
360: .  bs - the block size
361: .  n - the vector length
 362: .  cpuarray - CPU memory where the vector elements are to be stored.
363: -  gpuarray - GPU memory where the vector elements are to be stored.

365:    Output Parameter:
366: .  V - the vector

368:    Notes:
369:    If both cpuarray and gpuarray are provided, the caller must ensure that
370:    the provided arrays have identical values.

372:    PETSc does NOT free the provided arrays when the vector is destroyed via
373:    VecDestroy(). The user should not free the array until the vector is
374:    destroyed.

376:    Level: intermediate

378: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
379:           VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
380:           VecCUDAAllocateCheckHost()
381: @*/
382: PetscErrorCode  VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
383: {

387:   // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
388:   VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);

390:   if (cpuarray && gpuarray) {
391:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
392:     s->array = (PetscScalar*)cpuarray;
393:     (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
394:   } else if (cpuarray) {
395:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
396:     s->array = (PetscScalar*)cpuarray;
397:     (*V)->offloadmask = PETSC_OFFLOAD_CPU;
398:   } else if (gpuarray) {
399:     (*V)->offloadmask = PETSC_OFFLOAD_GPU;
400:   } else {
401:     (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
402:   }

404:   return(0);
405: }

407: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
408: {

412:   VecCUDAAllocateCheckHost(v);
413:   v->offloadmask = PETSC_OFFLOAD_CPU;
414:   *vv = *((PetscScalar**)v->data);
415:   return(0);
416: }

418: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
419: {

423:   V->boundtocpu = pin;
424:   if (pin) {
425:     VecCUDACopyFromGPU(V);
426:     V->offloadmask                 = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
427:     V->ops->dot                    = VecDot_Seq;
428:     V->ops->norm                   = VecNorm_Seq;
429:     V->ops->tdot                   = VecTDot_Seq;
430:     V->ops->scale                  = VecScale_Seq;
431:     V->ops->copy                   = VecCopy_Seq;
432:     V->ops->set                    = VecSet_Seq;
433:     V->ops->swap                   = VecSwap_Seq;
434:     V->ops->axpy                   = VecAXPY_Seq;
435:     V->ops->axpby                  = VecAXPBY_Seq;
436:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
437:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
438:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
439:     V->ops->setrandom              = VecSetRandom_Seq;
440:     V->ops->dot_local              = VecDot_Seq;
441:     V->ops->tdot_local             = VecTDot_Seq;
442:     V->ops->norm_local             = VecNorm_Seq;
443:     V->ops->mdot_local             = VecMDot_Seq;
444:     V->ops->mtdot_local            = VecMTDot_Seq;
445:     V->ops->maxpy                  = VecMAXPY_Seq;
446:     V->ops->mdot                   = VecMDot_Seq;
447:     V->ops->mtdot                  = VecMTDot_Seq;
448:     V->ops->aypx                   = VecAYPX_Seq;
449:     V->ops->waxpy                  = VecWAXPY_Seq;
450:     V->ops->dotnorm2               = NULL;
451:     V->ops->placearray             = VecPlaceArray_Seq;
452:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
453:     V->ops->resetarray             = VecResetArray_Seq;
454:     V->ops->duplicate              = VecDuplicate_Seq;
455:     V->ops->conjugate              = VecConjugate_Seq;
456:     V->ops->getlocalvector         = NULL;
457:     V->ops->restorelocalvector     = NULL;
458:     V->ops->getlocalvectorread     = NULL;
459:     V->ops->restorelocalvectorread = NULL;
460:     V->ops->getarraywrite          = NULL;
461:   } else {
462:     V->ops->dot                    = VecDot_SeqCUDA;
463:     V->ops->norm                   = VecNorm_SeqCUDA;
464:     V->ops->tdot                   = VecTDot_SeqCUDA;
465:     V->ops->scale                  = VecScale_SeqCUDA;
466:     V->ops->copy                   = VecCopy_SeqCUDA;
467:     V->ops->set                    = VecSet_SeqCUDA;
468:     V->ops->swap                   = VecSwap_SeqCUDA;
469:     V->ops->axpy                   = VecAXPY_SeqCUDA;
470:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
471:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
472:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
473:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
474:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
475:     V->ops->dot_local              = VecDot_SeqCUDA;
476:     V->ops->tdot_local             = VecTDot_SeqCUDA;
477:     V->ops->norm_local             = VecNorm_SeqCUDA;
478:     V->ops->mdot_local             = VecMDot_SeqCUDA;
479:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
480:     V->ops->mdot                   = VecMDot_SeqCUDA;
481:     V->ops->aypx                   = VecAYPX_SeqCUDA;
482:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
483:     V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
484:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
485:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
486:     V->ops->resetarray             = VecResetArray_SeqCUDA;
487:     V->ops->destroy                = VecDestroy_SeqCUDA;
488:     V->ops->duplicate              = VecDuplicate_SeqCUDA;
489:     V->ops->conjugate              = VecConjugate_SeqCUDA;
490:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
491:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
492:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
493:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
494:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
495:   }
496:   return(0);
497: }

499: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
500: {
502:   Vec_CUDA       *veccuda;
503:   PetscMPIInt    size;
504:   PetscBool      option_set;

507:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
508:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
509:   VecCreate_Seq_Private(V,0);
510:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
511:   VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
512:   V->ops->bindtocpu = VecBindToCPU_SeqCUDA;

514:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
515:   if (array) {
516:     if (!V->spptr) {
517:       PetscReal pinned_memory_min;
518:       PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
519:       veccuda = (Vec_CUDA*)V->spptr;
520:       veccuda->stream = 0; /* using default stream */
521:       veccuda->GPUarray_allocated = 0;
522:       V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

524:       pinned_memory_min = 0;
525:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
526:          Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
527:       PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
528:       PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
529:       if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
530:       PetscOptionsEnd();
531:     }
532:     veccuda = (Vec_CUDA*)V->spptr;
533:     veccuda->GPUarray = (PetscScalar*)array;
534:     V->offloadmask = PETSC_OFFLOAD_GPU;

536:   }
537:   return(0);
538: }