Actual source code: veccuda.c
petsc-3.13.6 2020-09-29
/*
   Implementation of the sequential CUDA vectors.

   This file contains the code that can be compiled with a C
   compiler.  The companion file veccuda2.cu contains the code that
   must be compiled with nvcc or a C++ compiler.
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>
/*
    Allocates space for the vector array on the host if it does not exist.
    Does NOT change the vector's offload mask.
    Does NOT zero the CUDA array.
*/
PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
{
  PetscScalar *array;
  Vec_Seq     *s = (Vec_Seq*)v->data;
  PetscInt    n = v->map->n;

  if (!s) {
    PetscNewLog((PetscObject)v,&s);
    v->data = s;
  }
  if (!s->array) {
    if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
      PetscMallocSetCUDAHost();
      v->pinned_memory = PETSC_TRUE;
    }
    PetscMalloc1(n,&array);
    PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
    s->array           = array;
    s->array_allocated = array;
    if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
      PetscMallocResetCUDAHost();
    }
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      v->offloadmask = PETSC_OFFLOAD_CPU;
    }
  }
  return(0);
}
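/*
   Usage sketch (illustrative, not part of the library source): the pinned-memory
   branch above is driven by v->minimum_bytes_pinned_memory, which can be set from
   the options database when the vector is created, e.g.

       ./myapp -vec_pinned_memory_min 1048576

   or programmatically before the vector is created:

       PetscOptionsSetValue(NULL,"-vec_pinned_memory_min","1048576");

   With that setting, host buffers for CUDA vectors of at least 1 MiB are obtained
   as pinned (page-locked) memory, which can speed up host<->device transfers;
   smaller vectors fall back to ordinary PetscMalloc1() allocations.
*/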
PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;

  VecCUDAAllocateCheckHost(xin);
  VecCUDAAllocateCheckHost(yin);
  if (xin != yin) {
    VecGetArrayRead(xin,&xa);
    VecGetArray(yin,&ya);
    PetscArraycpy(ya,xa,xin->map->n);
    VecRestoreArrayRead(xin,&xa);
    VecRestoreArray(yin,&ya);
  }
  return(0);
}
PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
{
  PetscInt    n = xin->map->n,i;
  PetscScalar *xx;

  VecGetArray(xin,&xx);
  for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
  VecRestoreArray(xin,&xx);
  return(0);
}
PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
{
  Vec_Seq *vs = (Vec_Seq*)v->data;

  PetscObjectSAWsViewOff(v);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  if (vs) {
    if (vs->array_allocated) {
      if (v->pinned_memory) {
        PetscMallocSetCUDAHost();
      }
      PetscFree(vs->array_allocated);
      if (v->pinned_memory) {
        PetscMallocResetCUDAHost();
        v->pinned_memory = PETSC_FALSE;
      }
    }
    PetscFree(vs);
  }
  return(0);
}
PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  v->array         = v->unplacedarray;
  v->unplacedarray = 0;
  return(0);
}
PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
{
  VecCUDAAllocateCheck(v);
  return(0);
}

PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
{
  VecCUDACopyToGPU(v);
  return(0);
}
/*
 VecCUDACopyToGPUSome_Public - Copies selected entries of a vector from the CPU to the GPU

 Input Parameters:
+ v - the vector
. ci - the requested indices; these should be created with CUDAIndicesCreate()
- mode - the scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyToGPUSome(v,ci,mode);
  return(0);
}
/*
 VecCUDACopyFromGPUSome_Public - Copies selected entries of a vector from the GPU to the CPU

 Input Parameters:
+ v - the vector
. ci - the requested indices; these should be created with CUDAIndicesCreate()
- mode - the scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyFromGPUSome(v,ci,mode);
  return(0);
}
PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
{
  VecSetRandom_SeqCUDA_Private(xin,r);
  xin->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}
PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
{
  VecCUDACopyFromGPU(vin);
  VecResetArray_SeqCUDA_Private(vin);
  vin->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecPlaceArray_Seq(vin,a);
  vin->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecReplaceArray_Seq(vin,a);
  vin->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}
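/*
   Usage sketch (illustrative, error checking omitted): VecPlaceArray() and
   VecResetArray() on a VECSEQCUDA vector substitute, and later restore, a
   user-provided *host* array; as implemented above, the current GPU values are
   first copied to the CPU and the vector is then treated as CPU-resident.

     Vec         x;
     PetscScalar work[10];
     PetscReal   nrm;
     PetscInt    i;

     for (i=0; i<10; i++) work[i] = (PetscScalar)i;
     VecCreateSeqCUDA(PETSC_COMM_SELF,10,&x);
     VecPlaceArray(x,work);    // x now uses 'work' as its host storage
     VecNorm(x,NORM_2,&nrm);   // data is copied to the GPU on demand
     VecResetArray(x);         // restore the vector's original storage
     VecDestroy(&x);

   A device array can be substituted instead with VecCUDAPlaceArray().
*/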
/*@
   VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

   Collective

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length

   Output Parameter:
.  v - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
{
  VecCreate(comm,v);
  VecSetSizes(*v,n,n);
  VecSetType(*v,VECSEQCUDA);
  return(0);
}
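/*
   Usage sketch (illustrative, error checking omitted):

     Vec x,y;

     VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);
     VecDuplicate(x,&y);   // y gets the same type and layout as x
     VecSet(x,1.0);
     VecSet(y,2.0);
     VecAXPY(y,3.0,x);     // y <- 3*x + y, dispatched to VecAXPY_SeqCUDA
     VecDestroy(&x);
     VecDestroy(&y);
*/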
PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
{
  VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
  PetscLayoutReference(win->map,&(*V)->map);
  PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
  PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}
PetscErrorCode VecCreate_SeqCUDA(Vec V)
{
  PetscLayoutSetUp(V->map);
  VecCUDAAllocateCheck(V);
  VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
  VecCUDAAllocateCheckHost(V);
  VecSet(V,0.0);
  VecSet_Seq(V,0.0);
  V->offloadmask = PETSC_OFFLOAD_BOTH;
  return(0);
}
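/*
   Usage sketch (illustrative, error checking omitted): the constructor above is
   what runs when the type is selected explicitly, or, if the application calls
   VecSetFromOptions(), via the runtime option -vec_type seqcuda.

     Vec x;

     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,100,100);
     VecSetType(x,VECSEQCUDA);   // invokes VecCreate_SeqCUDA(); data is zeroed and valid on both CPU and GPU
     VecDestroy(&x);
*/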
/*@C
   VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
   where the user provides the array space to store the vector values. The array
   provided must be a GPU array.

   Collective

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
.  bs - the block size
.  n - the vector length
-  array - GPU memory where the vector elements are to be stored

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

.seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
          VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
          VecCreateMPIWithArray()
@*/
PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
{
  PetscMPIInt size;

  VecCreate(comm,V);
  VecSetSizes(*V,n,n);
  VecSetBlockSize(*V,bs);
  MPI_Comm_size(comm,&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_SeqCUDA_Private(*V,array);
  (*V)->offloadmask = PETSC_OFFLOAD_GPU;
  return(0);
}
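/*
   Usage sketch (illustrative, error checking omitted): wrapping an existing
   device buffer.  The buffer here is obtained with plain cudaMalloc(); as noted
   above, PETSc never frees it.

     PetscScalar *d_buf;
     Vec         x;

     cudaMalloc((void**)&d_buf,100*sizeof(PetscScalar));
     VecCreateSeqCUDAWithArray(PETSC_COMM_SELF,1,100,d_buf,&x);
     VecSet(x,1.0);     // operates directly on d_buf on the GPU
     VecDestroy(&x);    // d_buf is NOT freed by PETSc
     cudaFree(d_buf);
*/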
PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
{
  VecCUDAAllocateCheckHost(v);
  v->offloadmask = PETSC_OFFLOAD_CPU;
  *vv = *((PetscScalar**)v->data);
  return(0);
}
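/*
   Usage sketch (illustrative, error checking omitted): VecGetArrayWrite() gives
   write-only host access; as implemented above it only ensures the host array
   exists and marks the CPU copy as current, so no device-to-host transfer takes
   place.  'x' is assumed to be an existing VECSEQCUDA vector.

     PetscScalar *a;
     PetscInt    i,n;

     VecGetLocalSize(x,&n);
     VecGetArrayWrite(x,&a);
     for (i=0; i<n; i++) a[i] = (PetscScalar)i;  // every entry is overwritten
     VecRestoreArrayWrite(x,&a);
*/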
PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool pin)
{
  V->boundtocpu = pin;
  if (pin) {
    VecCUDACopyFromGPU(V);
    V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
    V->ops->dot                    = VecDot_Seq;
    V->ops->norm                   = VecNorm_Seq;
    V->ops->tdot                   = VecTDot_Seq;
    V->ops->scale                  = VecScale_Seq;
    V->ops->copy                   = VecCopy_Seq;
    V->ops->set                    = VecSet_Seq;
    V->ops->swap                   = VecSwap_Seq;
    V->ops->axpy                   = VecAXPY_Seq;
    V->ops->axpby                  = VecAXPBY_Seq;
    V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
    V->ops->pointwisemult          = VecPointwiseMult_Seq;
    V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
    V->ops->setrandom              = VecSetRandom_Seq;
    V->ops->dot_local              = VecDot_Seq;
    V->ops->tdot_local             = VecTDot_Seq;
    V->ops->norm_local             = VecNorm_Seq;
    V->ops->mdot_local             = VecMDot_Seq;
    V->ops->mtdot_local            = VecMTDot_Seq;
    V->ops->maxpy                  = VecMAXPY_Seq;
    V->ops->mdot                   = VecMDot_Seq;
    V->ops->mtdot                  = VecMTDot_Seq;
    V->ops->aypx                   = VecAYPX_Seq;
    V->ops->waxpy                  = VecWAXPY_Seq;
    V->ops->dotnorm2               = NULL;
    V->ops->placearray             = VecPlaceArray_Seq;
    V->ops->replacearray           = VecReplaceArray_Seq;
    V->ops->resetarray             = VecResetArray_Seq;
    V->ops->duplicate              = VecDuplicate_Seq;
    V->ops->conjugate              = VecConjugate_Seq;
    V->ops->getlocalvector         = NULL;
    V->ops->restorelocalvector     = NULL;
    V->ops->getlocalvectorread     = NULL;
    V->ops->restorelocalvectorread = NULL;
    V->ops->getarraywrite          = NULL;
  } else {
    V->ops->dot                    = VecDot_SeqCUDA;
    V->ops->norm                   = VecNorm_SeqCUDA;
    V->ops->tdot                   = VecTDot_SeqCUDA;
    V->ops->scale                  = VecScale_SeqCUDA;
    V->ops->copy                   = VecCopy_SeqCUDA;
    V->ops->set                    = VecSet_SeqCUDA;
    V->ops->swap                   = VecSwap_SeqCUDA;
    V->ops->axpy                   = VecAXPY_SeqCUDA;
    V->ops->axpby                  = VecAXPBY_SeqCUDA;
    V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
    V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
    V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
    V->ops->setrandom              = VecSetRandom_SeqCUDA;
    V->ops->dot_local              = VecDot_SeqCUDA;
    V->ops->tdot_local             = VecTDot_SeqCUDA;
    V->ops->norm_local             = VecNorm_SeqCUDA;
    V->ops->mdot_local             = VecMDot_SeqCUDA;
    V->ops->maxpy                  = VecMAXPY_SeqCUDA;
    V->ops->mdot                   = VecMDot_SeqCUDA;
    V->ops->aypx                   = VecAYPX_SeqCUDA;
    V->ops->waxpy                  = VecWAXPY_SeqCUDA;
    V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
    V->ops->placearray             = VecPlaceArray_SeqCUDA;
    V->ops->replacearray           = VecReplaceArray_SeqCUDA;
    V->ops->resetarray             = VecResetArray_SeqCUDA;
    V->ops->destroy                = VecDestroy_SeqCUDA;
    V->ops->duplicate              = VecDuplicate_SeqCUDA;
    V->ops->conjugate              = VecConjugate_SeqCUDA;
    V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
    V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
    V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
  }
  return(0);
}
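/*
   Usage sketch (illustrative, error checking omitted): VecBindToCPU() toggles
   between the two operation tables installed above.  'x' is assumed to be an
   existing VECSEQCUDA vector.

     VecBindToCPU(x,PETSC_TRUE);    // subsequent operations run on the host
     VecSet(x,1.0);                 // dispatches to VecSet_Seq
     VecBindToCPU(x,PETSC_FALSE);   // back to the CUDA implementations
     VecSet(x,2.0);                 // dispatches to VecSet_SeqCUDA
*/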
PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
{
  Vec_CUDA    *veccuda;
  PetscMPIInt size;
  PetscBool   option_set;

  MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_Seq_Private(V,0);
  PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
  VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
  V->ops->bindtocpu = VecBindToCPU_SeqCUDA;

  /* Later, functions check for the existence of the Vec_CUDA structure, so do not create it if no array is given */
  if (array) {
    if (!V->spptr) {
      PetscReal pinned_memory_min;
      PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
      veccuda = (Vec_CUDA*)V->spptr;
      veccuda->stream = 0; /* using default stream */
      veccuda->GPUarray_allocated = 0;
      V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

      pinned_memory_min = 0;
      /* The command line must be parsed here for the minimum size at which to use pinned memory for host allocations.
         Note: this same code is duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
      PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
      PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
      if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
      PetscOptionsEnd();
    }
    veccuda = (Vec_CUDA*)V->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
  }

  return(0);
}