Actual source code: veccuda.c
petsc-3.14.6 2021-03-30
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsc/private/vecimpl.h>
13: #include <../src/vec/vec/impls/dvecimpl.h>
14: #include <petsc/private/cudavecimpl.h>
16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
17: {
20: if (x) {
21: Vec_Seq *h = (Vec_Seq*)v->data;
23: *x = h->array;
24: }
25: if (x_d) {
26: Vec_CUDA *d = (Vec_CUDA*)v->spptr;
28: *x_d = d ? d->GPUarray : NULL;
29: }
30: if (flg) *flg = v->offloadmask;
31: return(0);
32: }
34: /*
35: Allocates space for the vector array on the Host if it does not exist.
36: Does NOT change the PetscCUDAFlag for the vector
37: Does NOT zero the CUDA array
38: */
39: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
40: {
42: PetscScalar *array;
43: Vec_Seq *s = (Vec_Seq*)v->data;
44: PetscInt n = v->map->n;
47: if (!s) {
48: PetscNewLog((PetscObject)v,&s);
49: v->data = s;
50: }
51: if (!s->array) {
52: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
53: PetscMallocSetCUDAHost();
54: v->pinned_memory = PETSC_TRUE;
55: }
56: PetscMalloc1(n,&array);
57: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
58: s->array = array;
59: s->array_allocated = array;
60: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
61: PetscMallocResetCUDAHost();
62: }
63: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
64: v->offloadmask = PETSC_OFFLOAD_CPU;
65: }
66: }
67: return(0);
68: }
70: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
71: {
72: PetscScalar *ya;
73: const PetscScalar *xa;
74: PetscErrorCode ierr;
77: VecCUDAAllocateCheckHost(xin);
78: VecCUDAAllocateCheckHost(yin);
79: if (xin != yin) {
80: VecGetArrayRead(xin,&xa);
81: VecGetArray(yin,&ya);
82: PetscArraycpy(ya,xa,xin->map->n);
83: VecRestoreArrayRead(xin,&xa);
84: VecRestoreArray(yin,&ya);
85: }
86: return(0);
87: }
89: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
90: {
92: PetscInt n = xin->map->n,i;
93: PetscScalar *xx;
96: VecGetArray(xin,&xx);
97: for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
98: VecRestoreArray(xin,&xx);
99: return(0);
100: }
102: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
103: {
104: Vec_Seq *vs = (Vec_Seq*)v->data;
108: PetscObjectSAWsViewOff(v);
109: #if defined(PETSC_USE_LOG)
110: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
111: #endif
112: if (vs) {
113: if (vs->array_allocated) {
114: if (v->pinned_memory) {
115: PetscMallocSetCUDAHost();
116: }
117: PetscFree(vs->array_allocated);
118: if (v->pinned_memory) {
119: PetscMallocResetCUDAHost();
120: v->pinned_memory = PETSC_FALSE;
121: }
122: }
123: PetscFree(vs);
124: }
125: return(0);
126: }
128: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
129: {
130: Vec_Seq *v = (Vec_Seq*)vin->data;
133: v->array = v->unplacedarray;
134: v->unplacedarray = 0;
135: return(0);
136: }
138: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
139: {
143: VecCUDAAllocateCheck(v);
144: return(0);
145: }
147: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
148: {
152: VecCUDACopyToGPU(v);
153: return(0);
154: }
156: /*
157: VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector
159: Input Parameters:
160: + v - the vector
161: . ci - the requested indices, this should be created with CUDAIndicesCreate()
162: - mode - vec scatter mode used in VecScatterBegin/End
163: */
164: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
165: {
169: VecCUDACopyToGPUSome(v,ci,mode);
170: return(0);
171: }
173: /*
174: VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector
176: Input Parameters:
177: + v - the vector
178: . ci - the requested indices, this should be created with CUDAIndicesCreate()
179: - mode - vec scatter mode used in VecScatterBegin/End
180: */
181: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
182: {
186: VecCUDACopyFromGPUSome(v,ci,mode);
187: return(0);
188: }
190: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
191: {
195: VecSetRandom_SeqCUDA_Private(xin,r);
196: xin->offloadmask = PETSC_OFFLOAD_CPU;
197: return(0);
198: }
200: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
201: {
205: VecCUDACopyFromGPU(vin);
206: VecResetArray_SeqCUDA_Private(vin);
207: vin->offloadmask = PETSC_OFFLOAD_CPU;
208: return(0);
209: }
211: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
212: {
216: VecCUDACopyFromGPU(vin);
217: VecPlaceArray_Seq(vin,a);
218: vin->offloadmask = PETSC_OFFLOAD_CPU;
219: return(0);
220: }
222: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
223: {
225: Vec_Seq *vs = (Vec_Seq*)vin->data;
228: if (vs->array != vs->array_allocated) {
229: /* make sure the users array has the latest values */
230: VecCUDACopyFromGPU(vin);
231: }
232: if (vs->array_allocated) {
233: if (vin->pinned_memory) {
234: PetscMallocSetCUDAHost();
235: }
236: PetscFree(vs->array_allocated);
237: if (vin->pinned_memory) {
238: PetscMallocResetCUDAHost();
239: }
240: }
241: vin->pinned_memory = PETSC_FALSE;
242: vs->array_allocated = vs->array = (PetscScalar*)a;
243: vin->offloadmask = PETSC_OFFLOAD_CPU;
244: return(0);
245: }
247: /*@
248: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
250: Collective
252: Input Parameter:
253: + comm - the communicator, should be PETSC_COMM_SELF
254: - n - the vector length
256: Output Parameter:
257: . v - the vector
259: Notes:
260: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
261: same type as an existing vector.
263: Level: intermediate
265: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
266: @*/
267: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
268: {
272: VecCreate(comm,v);
273: VecSetSizes(*v,n,n);
274: VecSetType(*v,VECSEQCUDA);
275: return(0);
276: }
278: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
279: {
283: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
284: PetscLayoutReference(win->map,&(*V)->map);
285: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
286: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
287: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
288: return(0);
289: }
291: PetscErrorCode VecCreate_SeqCUDA(Vec V)
292: {
296: PetscCUDAInitializeCheck();
297: PetscLayoutSetUp(V->map);
298: VecCUDAAllocateCheck(V);
299: VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
300: VecCUDAAllocateCheckHost(V);
301: VecSet(V,0.0);
302: VecSet_Seq(V,0.0);
303: V->offloadmask = PETSC_OFFLOAD_BOTH;
304: return(0);
305: }
307: /*@C
308: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
309: where the user provides the array space to store the vector values. The array
310: provided must be a GPU array.
312: Collective
314: Input Parameter:
315: + comm - the communicator, should be PETSC_COMM_SELF
316: . bs - the block size
317: . n - the vector length
318: - array - GPU memory where the vector elements are to be stored.
320: Output Parameter:
321: . V - the vector
323: Notes:
324: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
325: same type as an existing vector.
327: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
328: at a later stage to SET the array for storing the vector values.
330: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
331: The user should not free the array until the vector is destroyed.
333: Level: intermediate
335: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
336: VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
337: VecCreateMPIWithArray()
338: @*/
339: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
340: {
344: PetscCUDAInitializeCheck();
345: VecCreate(comm,V);
346: VecSetSizes(*V,n,n);
347: VecSetBlockSize(*V,bs);
348: VecCreate_SeqCUDA_Private(*V,array);
349: return(0);
350: }
352: /*@C
353: VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
354: where the user provides the array space to store the vector values.
356: Collective
358: Input Parameter:
359: + comm - the communicator, should be PETSC_COMM_SELF
360: . bs - the block size
361: . n - the vector length
362: - cpuarray - CPU memory where the vector elements are to be stored.
363: - gpuarray - GPU memory where the vector elements are to be stored.
365: Output Parameter:
366: . V - the vector
368: Notes:
369: If both cpuarray and gpuarray are provided, the caller must ensure that
370: the provided arrays have identical values.
372: PETSc does NOT free the provided arrays when the vector is destroyed via
373: VecDestroy(). The user should not free the array until the vector is
374: destroyed.
376: Level: intermediate
378: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
379: VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
380: VecCUDAAllocateCheckHost()
381: @*/
382: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
383: {
387: // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
388: VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);
390: if (cpuarray && gpuarray) {
391: Vec_Seq *s = (Vec_Seq*)((*V)->data);
392: s->array = (PetscScalar*)cpuarray;
393: (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
394: } else if (cpuarray) {
395: Vec_Seq *s = (Vec_Seq*)((*V)->data);
396: s->array = (PetscScalar*)cpuarray;
397: (*V)->offloadmask = PETSC_OFFLOAD_CPU;
398: } else if (gpuarray) {
399: (*V)->offloadmask = PETSC_OFFLOAD_GPU;
400: } else {
401: (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
402: }
404: return(0);
405: }
407: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
408: {
412: VecCUDAAllocateCheckHost(v);
413: v->offloadmask = PETSC_OFFLOAD_CPU;
414: *vv = *((PetscScalar**)v->data);
415: return(0);
416: }
418: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
419: {
423: V->boundtocpu = pin;
424: if (pin) {
425: VecCUDACopyFromGPU(V);
426: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
427: V->ops->dot = VecDot_Seq;
428: V->ops->norm = VecNorm_Seq;
429: V->ops->tdot = VecTDot_Seq;
430: V->ops->scale = VecScale_Seq;
431: V->ops->copy = VecCopy_Seq;
432: V->ops->set = VecSet_Seq;
433: V->ops->swap = VecSwap_Seq;
434: V->ops->axpy = VecAXPY_Seq;
435: V->ops->axpby = VecAXPBY_Seq;
436: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
437: V->ops->pointwisemult = VecPointwiseMult_Seq;
438: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
439: V->ops->setrandom = VecSetRandom_Seq;
440: V->ops->dot_local = VecDot_Seq;
441: V->ops->tdot_local = VecTDot_Seq;
442: V->ops->norm_local = VecNorm_Seq;
443: V->ops->mdot_local = VecMDot_Seq;
444: V->ops->mtdot_local = VecMTDot_Seq;
445: V->ops->maxpy = VecMAXPY_Seq;
446: V->ops->mdot = VecMDot_Seq;
447: V->ops->mtdot = VecMTDot_Seq;
448: V->ops->aypx = VecAYPX_Seq;
449: V->ops->waxpy = VecWAXPY_Seq;
450: V->ops->dotnorm2 = NULL;
451: V->ops->placearray = VecPlaceArray_Seq;
452: V->ops->replacearray = VecReplaceArray_SeqCUDA;
453: V->ops->resetarray = VecResetArray_Seq;
454: V->ops->duplicate = VecDuplicate_Seq;
455: V->ops->conjugate = VecConjugate_Seq;
456: V->ops->getlocalvector = NULL;
457: V->ops->restorelocalvector = NULL;
458: V->ops->getlocalvectorread = NULL;
459: V->ops->restorelocalvectorread = NULL;
460: V->ops->getarraywrite = NULL;
461: } else {
462: V->ops->dot = VecDot_SeqCUDA;
463: V->ops->norm = VecNorm_SeqCUDA;
464: V->ops->tdot = VecTDot_SeqCUDA;
465: V->ops->scale = VecScale_SeqCUDA;
466: V->ops->copy = VecCopy_SeqCUDA;
467: V->ops->set = VecSet_SeqCUDA;
468: V->ops->swap = VecSwap_SeqCUDA;
469: V->ops->axpy = VecAXPY_SeqCUDA;
470: V->ops->axpby = VecAXPBY_SeqCUDA;
471: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
472: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
473: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
474: V->ops->setrandom = VecSetRandom_SeqCUDA;
475: V->ops->dot_local = VecDot_SeqCUDA;
476: V->ops->tdot_local = VecTDot_SeqCUDA;
477: V->ops->norm_local = VecNorm_SeqCUDA;
478: V->ops->mdot_local = VecMDot_SeqCUDA;
479: V->ops->maxpy = VecMAXPY_SeqCUDA;
480: V->ops->mdot = VecMDot_SeqCUDA;
481: V->ops->aypx = VecAYPX_SeqCUDA;
482: V->ops->waxpy = VecWAXPY_SeqCUDA;
483: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
484: V->ops->placearray = VecPlaceArray_SeqCUDA;
485: V->ops->replacearray = VecReplaceArray_SeqCUDA;
486: V->ops->resetarray = VecResetArray_SeqCUDA;
487: V->ops->destroy = VecDestroy_SeqCUDA;
488: V->ops->duplicate = VecDuplicate_SeqCUDA;
489: V->ops->conjugate = VecConjugate_SeqCUDA;
490: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
491: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
492: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUDA;
493: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
494: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
495: }
496: return(0);
497: }
499: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
500: {
502: Vec_CUDA *veccuda;
503: PetscMPIInt size;
504: PetscBool option_set;
507: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
508: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
509: VecCreate_Seq_Private(V,0);
510: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
511: VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
512: V->ops->bindtocpu = VecBindToCPU_SeqCUDA;
514: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
515: if (array) {
516: if (!V->spptr) {
517: PetscReal pinned_memory_min;
518: PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
519: veccuda = (Vec_CUDA*)V->spptr;
520: veccuda->stream = 0; /* using default stream */
521: veccuda->GPUarray_allocated = 0;
522: V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
524: pinned_memory_min = 0;
525: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
526: Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
527: PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
528: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
529: if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
530: PetscOptionsEnd();
531: }
532: veccuda = (Vec_CUDA*)V->spptr;
533: veccuda->GPUarray = (PetscScalar*)array;
534: V->offloadmask = PETSC_OFFLOAD_GPU;
536: }
537: return(0);
538: }