Actual source code: mpicuda.cu

petsc-3.14.6 2021-03-30
Report Typos and Errors

  2: /*
  3:    This file contains routines for Parallel vector operations.
  4:  */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: /*MC
 13:    VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.

 15:    Options Database Keys:
 16: . -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()

 18:   Level: beginner

 20: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECSEQCUDA, VECMPICUDA, VECSTANDARD, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
 21: M*/

 23: PetscErrorCode VecDestroy_MPICUDA(Vec v)
 24: {
 25:   Vec_MPI        *vecmpi = (Vec_MPI*)v->data;
 26:   Vec_CUDA       *veccuda;
 28:   cudaError_t    err;

 31:   if (v->spptr) {
 32:     veccuda = (Vec_CUDA*)v->spptr;
 33:     if (veccuda->GPUarray_allocated) {
 34:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
 35:       veccuda->GPUarray_allocated = NULL;
 36:     }
 37:     if (veccuda->stream) {
 38:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
 39:     }
 40:     if (v->pinned_memory) {
 41:       PetscMallocSetCUDAHost();
 42:       PetscFree(vecmpi->array_allocated);
 43:       PetscMallocResetCUDAHost();
 44:       v->pinned_memory = PETSC_FALSE;
 45:     }
 46:     PetscFree(v->spptr);
 47:   }
 48:   VecDestroy_MPI(v);
 49:   return(0);
 50: }

 52: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
 53: {
 54:   PetscReal      sum,work = 0.0;

 58:   if (type == NORM_2 || type == NORM_FROBENIUS) {
 59:     VecNorm_SeqCUDA(xin,NORM_2,&work);
 60:     work *= work;
 61:     MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 62:     *z    = PetscSqrtReal(sum);
 63:   } else if (type == NORM_1) {
 64:     /* Find the local part */
 65:     VecNorm_SeqCUDA(xin,NORM_1,&work);
 66:     /* Find the global max */
 67:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 68:   } else if (type == NORM_INFINITY) {
 69:     /* Find the local max */
 70:     VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
 71:     /* Find the global max */
 72:     MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
 73:   } else if (type == NORM_1_AND_2) {
 74:     PetscReal temp[2];
 75:     VecNorm_SeqCUDA(xin,NORM_1,temp);
 76:     VecNorm_SeqCUDA(xin,NORM_2,temp+1);
 77:     temp[1] = temp[1]*temp[1];
 78:     MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 79:     z[1] = PetscSqrtReal(z[1]);
 80:   }
 81:   return(0);
 82: }

 84: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 85: {
 86:   PetscScalar    sum,work;

 90:   VecDot_SeqCUDA(xin,yin,&work);
 91:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
 92:   *z   = sum;
 93:   return(0);
 94: }

 96: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
 97: {
 98:   PetscScalar    sum,work;

102:   VecTDot_SeqCUDA(xin,yin,&work);
103:   MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
104:   *z   = sum;
105:   return(0);
106: }

108: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
109: {
110:   PetscScalar    awork[128],*work = awork;

114:   if (nv > 128) {
115:     PetscMalloc1(nv,&work);
116:   }
117:   VecMDot_SeqCUDA(xin,nv,y,work);
118:   MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
119:   if (nv > 128) {
120:     PetscFree(work);
121:   }
122:   return(0);
123: }

125: /*MC
126:    VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

128:    Options Database Keys:
129: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

131:   Level: beginner

133: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
134: M*/


137: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
138: {
140:   Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
141:   PetscScalar    *array;

144:   VecCreate(PetscObjectComm((PetscObject)win),v);
145:   PetscLayoutReference(win->map,&(*v)->map);

147:   VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
148:   vw   = (Vec_MPI*)(*v)->data;
149:   PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));

151:   /* save local representation of the parallel vector (and scatter) if it exists */
152:   if (w->localrep) {
153:     VecGetArray(*v,&array);
154:     VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
155:     PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
156:     VecRestoreArray(*v,&array);
157:     PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
158:     vw->localupdate = w->localupdate;
159:     if (vw->localupdate) {
160:       PetscObjectReference((PetscObject)vw->localupdate);
161:     }
162:   }

164:   /* New vector should inherit stashing property of parent */
165:   (*v)->stash.donotstash   = win->stash.donotstash;
166:   (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

168:   /* change type_name appropriately */
169:   VecCUDAAllocateCheck(*v);
170:   PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);

172:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
173:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
174:   (*v)->map->bs   = PetscAbs(win->map->bs);
175:   (*v)->bstash.bs = win->bstash.bs;
176:   return(0);
177: }

179: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
180: {
182:   PetscScalar    work[2],sum[2];

185:   VecDotNorm2_SeqCUDA(s,t,work,work+1);
186:   MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
187:   *dp  = sum[0];
188:   *nm  = sum[1];
189:   return(0);
190: }

192: PetscErrorCode VecCreate_MPICUDA(Vec vv)
193: {

197:   PetscCUDAInitializeCheck();
198:   PetscLayoutSetUp(vv->map);
199:   VecCUDAAllocateCheck(vv);
200:   VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
201:   VecCUDAAllocateCheckHost(vv);
202:   VecSet(vv,0.0);
203:   VecSet_Seq(vv,0.0);
204:   vv->offloadmask = PETSC_OFFLOAD_BOTH;
205:   return(0);
206: }

208: PetscErrorCode VecCreate_CUDA(Vec v)
209: {
211:   PetscMPIInt    size;

214:   MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
215:   if (size == 1) {
216:     VecSetType(v,VECSEQCUDA);
217:   } else {
218:     VecSetType(v,VECMPICUDA);
219:   }
220:   return(0);
221: }

223: /*@C
224:    VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
225:    where the user provides the GPU array space to store the vector values.

227:    Collective

229:    Input Parameters:
230: +  comm  - the MPI communicator to use
231: .  bs    - block size, same meaning as VecSetBlockSize()
232: .  n     - local vector length, cannot be PETSC_DECIDE
233: .  N     - global vector length (or PETSC_DECIDE to have calculated)
234: -  array - the user provided GPU array to store the vector values

236:    Output Parameter:
237: .  vv - the vector

239:    Notes:
240:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
241:    same type as an existing vector.

243:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
244:    at a later stage to SET the array for storing the vector values.

246:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
247:    The user should not free the array until the vector is destroyed.

249:    Level: intermediate

251: .seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
252:           VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
253:           VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()

255: @*/
256: PetscErrorCode  VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
257: {

261:   if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
262:   PetscCUDAInitializeCheck();
263:   VecCreate(comm,vv);
264:   VecSetSizes(*vv,n,N);
265:   VecSetBlockSize(*vv,bs);
266:   VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
267:   return(0);
268: }

270: /*@C
271:    VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
272:    where the user provides the GPU array space to store the vector values.

274:    Collective

276:    Input Parameters:
277: +  comm  - the MPI communicator to use
278: .  bs    - block size, same meaning as VecSetBlockSize()
279: .  n     - local vector length, cannot be PETSC_DECIDE
280: .  N     - global vector length (or PETSC_DECIDE to have calculated)
281: -  cpuarray - the user provided CPU array to store the vector values
282: -  gpuarray - the user provided GPU array to store the vector values

284:    Output Parameter:
285: .  vv - the vector

287:    Notes:
288:    If both cpuarray and gpuarray are provided, the caller must ensure that
289:    the provided arrays have identical values.

291:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
292:    same type as an existing vector.

294:    PETSc does NOT free the provided arrays when the vector is destroyed via
295:    VecDestroy(). The user should not free the array until the vector is
296:    destroyed.

298:    Level: intermediate

300: .seealso: VecCreateSeqCUDAWithArrays(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
301:           VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
302:           VecCreateMPI(), VecCreateGhostWithArray(), VecCUDAPlaceArray(), VecPlaceArray(),
303:           VecCUDAAllocateCheckHost()
304: @*/
305: PetscErrorCode  VecCreateMPICUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *vv)
306: {

310:   VecCreateMPICUDAWithArray(comm,bs,n,N,gpuarray,vv);

312:   if (cpuarray && gpuarray) {
313:     Vec_MPI *s         = (Vec_MPI*)((*vv)->data);
314:     s->array           = (PetscScalar*)cpuarray;
315:     (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
316:   } else if (cpuarray) {
317:     Vec_MPI *s         = (Vec_MPI*)((*vv)->data);
318:     s->array           = (PetscScalar*)cpuarray;
319:     (*vv)->offloadmask =  PETSC_OFFLOAD_CPU;
320:   } else if (gpuarray) {
321:     (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
322:   } else {
323:     (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
324:   }

326:   return(0);
327: }

329: PetscErrorCode VecBindToCPU_MPICUDA(Vec V,PetscBool pin)
330: {

334:   V->boundtocpu = pin;
335:   if (pin) {
336:     VecCUDACopyFromGPU(V);
337:     V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
338:     V->ops->dotnorm2               = NULL;
339:     V->ops->waxpy                  = VecWAXPY_Seq;
340:     V->ops->dot                    = VecDot_MPI;
341:     V->ops->mdot                   = VecMDot_MPI;
342:     V->ops->tdot                   = VecTDot_MPI;
343:     V->ops->norm                   = VecNorm_MPI;
344:     V->ops->scale                  = VecScale_Seq;
345:     V->ops->copy                   = VecCopy_Seq;
346:     V->ops->set                    = VecSet_Seq;
347:     V->ops->swap                   = VecSwap_Seq;
348:     V->ops->axpy                   = VecAXPY_Seq;
349:     V->ops->axpby                  = VecAXPBY_Seq;
350:     V->ops->maxpy                  = VecMAXPY_Seq;
351:     V->ops->aypx                   = VecAYPX_Seq;
352:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
353:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
354:     V->ops->setrandom              = VecSetRandom_Seq;
355:     V->ops->placearray             = VecPlaceArray_Seq;
356:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
357:     V->ops->resetarray             = VecResetArray_Seq;
358:     V->ops->dot_local              = VecDot_Seq;
359:     V->ops->tdot_local             = VecTDot_Seq;
360:     V->ops->norm_local             = VecNorm_Seq;
361:     V->ops->mdot_local             = VecMDot_Seq;
362:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
363:     V->ops->getlocalvector         = NULL;
364:     V->ops->restorelocalvector     = NULL;
365:     V->ops->getlocalvectorread     = NULL;
366:     V->ops->restorelocalvectorread = NULL;
367:     V->ops->getarraywrite          = NULL;
368:   } else {
369:     V->ops->dotnorm2               = VecDotNorm2_MPICUDA;
370:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
371:     V->ops->duplicate              = VecDuplicate_MPICUDA;
372:     V->ops->dot                    = VecDot_MPICUDA;
373:     V->ops->mdot                   = VecMDot_MPICUDA;
374:     V->ops->tdot                   = VecTDot_MPICUDA;
375:     V->ops->norm                   = VecNorm_MPICUDA;
376:     V->ops->scale                  = VecScale_SeqCUDA;
377:     V->ops->copy                   = VecCopy_SeqCUDA;
378:     V->ops->set                    = VecSet_SeqCUDA;
379:     V->ops->swap                   = VecSwap_SeqCUDA;
380:     V->ops->axpy                   = VecAXPY_SeqCUDA;
381:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
382:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
383:     V->ops->aypx                   = VecAYPX_SeqCUDA;
384:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
385:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
386:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
387:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
388:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
389:     V->ops->resetarray             = VecResetArray_SeqCUDA;
390:     V->ops->dot_local              = VecDot_SeqCUDA;
391:     V->ops->tdot_local             = VecTDot_SeqCUDA;
392:     V->ops->norm_local             = VecNorm_SeqCUDA;
393:     V->ops->mdot_local             = VecMDot_SeqCUDA;
394:     V->ops->destroy                = VecDestroy_MPICUDA;
395:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
396:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
397:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
398:     V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
399:     V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
400:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
401:   }
402:   return(0);
403: }

405: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
406: {
408:   Vec_CUDA       *veccuda;

411:   VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
412:   PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);

414:   VecBindToCPU_MPICUDA(vv,PETSC_FALSE);
415:   vv->ops->bindtocpu = VecBindToCPU_MPICUDA;

417:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
418:   if (alloc && !array) {
419:     VecCUDAAllocateCheck(vv);
420:     VecCUDAAllocateCheckHost(vv);
421:     VecSet(vv,0.0);
422:     VecSet_Seq(vv,0.0);
423:     vv->offloadmask = PETSC_OFFLOAD_BOTH;
424:   }
425:   if (array) {
426:     if (!vv->spptr) {
427:       PetscReal pinned_memory_min;
428:       PetscBool flag;
429:       /* Cannot use PetscNew() here because spptr is void* */
430:       PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);
431:       veccuda = (Vec_CUDA*)vv->spptr;
432:       veccuda->stream = 0; /* using default stream */
433:       veccuda->GPUarray_allocated = 0;
434:       vv->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
435:       vv->minimum_bytes_pinned_memory = 0;

437:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
438:          Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
439:       PetscOptionsBegin(PetscObjectComm((PetscObject)vv),((PetscObject)vv)->prefix,"VECCUDA Options","Vec");
440:       pinned_memory_min = vv->minimum_bytes_pinned_memory;
441:       PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&flag);
442:       if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
443:       PetscOptionsEnd();
444:     }
445:     veccuda = (Vec_CUDA*)vv->spptr;
446:     veccuda->GPUarray = (PetscScalar*)array;
447:   }
448:   return(0);
449: }