Actual source code: veccuda.c
petsc-3.10.5 2019-03-28
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsccuda.h>
13: #include <petsc/private/vecimpl.h> /*I <petscvec.h> I*/
14: #include <../src/vec/vec/impls/dvecimpl.h>
15: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
17: static PetscErrorCode PetscCUBLASDestroyHandle();
19: /*
20: Implementation for obtaining read-write access to the cuBLAS handle.
21: Required to properly deal with repeated calls of PetscInitizalize()/PetscFinalize().
22: */
23: static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
24: {
25: static cublasHandle_t cublasv2handle = NULL;
26: cublasStatus_t cberr;
27: PetscErrorCode ierr;
30: if (!cublasv2handle) {
31: cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
32: /* Make sure that the handle will be destroyed properly */
33: PetscRegisterFinalize(PetscCUBLASDestroyHandle);
34: }
35: *handle = &cublasv2handle;
36: return(0);
37: }
39: /*
40: Singleton for obtaining a handle to cuBLAS.
41: The handle is required for calls to routines in cuBLAS.
42: */
43: PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
44: {
45: cublasHandle_t *p_cublasv2handle;
49: PetscCUBLASGetHandle_Private(&p_cublasv2handle);
50: *handle = *p_cublasv2handle;
51: return(0);
52: }
55: /*
56: Destroys the CUBLAS handle.
57: This function is intended and registered for PetscFinalize - do not call manually!
58: */
59: PetscErrorCode PetscCUBLASDestroyHandle()
60: {
61: cublasHandle_t *p_cublasv2handle;
62: cublasStatus_t cberr;
66: PetscCUBLASGetHandle_Private(&p_cublasv2handle);
67: cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
68: *p_cublasv2handle = NULL; /* Ensures proper reinitialization */
69: return(0);
70: }
72: /*
73: Allocates space for the vector array on the Host if it does not exist.
74: Does NOT change the PetscCUDAFlag for the vector
75: Does NOT zero the CUDA array
76: */
77: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
78: {
80: PetscScalar *array;
81: Vec_Seq *s = (Vec_Seq*)v->data;
82: PetscInt n = v->map->n;
85: if (!s) {
86: PetscNewLog((PetscObject)v,&s);
87: v->data = s;
88: }
89: if (!s->array) {
90: PetscMalloc1(n,&array);
91: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
92: s->array = array;
93: s->array_allocated = array;
94: if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
95: v->valid_GPU_array = PETSC_OFFLOAD_CPU;
96: }
97: }
98: return(0);
99: }
101: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
102: {
103: PetscScalar *ya;
104: const PetscScalar *xa;
105: PetscErrorCode ierr;
108: VecCUDAAllocateCheckHost(xin);
109: VecCUDAAllocateCheckHost(yin);
110: if (xin != yin) {
111: VecGetArrayRead(xin,&xa);
112: VecGetArray(yin,&ya);
113: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
114: VecRestoreArrayRead(xin,&xa);
115: VecRestoreArray(yin,&ya);
116: }
117: return(0);
118: }
120: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
121: {
123: PetscInt n = xin->map->n,i;
124: PetscScalar *xx;
127: VecGetArray(xin,&xx);
128: for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
129: VecRestoreArray(xin,&xx);
130: return(0);
131: }
133: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
134: {
135: Vec_Seq *vs = (Vec_Seq*)v->data;
139: PetscObjectSAWsViewOff(v);
140: #if defined(PETSC_USE_LOG)
141: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
142: #endif
143: if (vs) {
144: if (vs->array_allocated) { PetscFree(vs->array_allocated); }
145: PetscFree(vs);
146: }
147: return(0);
148: }
150: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
151: {
152: Vec_Seq *v = (Vec_Seq*)vin->data;
155: v->array = v->unplacedarray;
156: v->unplacedarray = 0;
157: return(0);
158: }
160: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
161: {
165: VecCUDAAllocateCheck(v);
166: return(0);
167: }
169: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
170: {
174: VecCUDACopyToGPU(v);
175: return(0);
176: }
178: /*
179: VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector
181: Input Parameters:
182: + v - the vector
183: . ci - the requested indices, this should be created with CUDAIndicesCreate()
184: - mode - vec scatter mode used in VecScatterBegin/End
185: */
186: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
187: {
191: VecCUDACopyToGPUSome(v,ci,mode);
192: return(0);
193: }
195: /*
196: VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector
198: Input Parameters:
199: + v - the vector
200: . ci - the requested indices, this should be created with CUDAIndicesCreate()
201: - mode - vec scatter mode used in VecScatterBegin/End
202: */
203: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
204: {
208: VecCUDACopyFromGPUSome(v,ci,mode);
209: return(0);
210: }
212: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
213: {
217: VecSetRandom_SeqCUDA_Private(xin,r);
218: xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
219: return(0);
220: }
222: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
223: {
227: VecCUDACopyFromGPU(vin);
228: VecResetArray_SeqCUDA_Private(vin);
229: vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
230: return(0);
231: }
233: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
234: {
238: VecCUDACopyFromGPU(vin);
239: VecPlaceArray_Seq(vin,a);
240: vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
241: return(0);
242: }
244: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
245: {
249: VecCUDACopyFromGPU(vin);
250: VecReplaceArray_Seq(vin,a);
251: vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
252: return(0);
253: }
255: /*@
256: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
258: Collective on MPI_Comm
260: Input Parameter:
261: + comm - the communicator, should be PETSC_COMM_SELF
262: - n - the vector length
264: Output Parameter:
265: . v - the vector
267: Notes:
268: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
269: same type as an existing vector.
271: Level: intermediate
273: Concepts: vectors^creating sequential
275: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
276: @*/
277: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
278: {
282: VecCreate(comm,v);
283: VecSetSizes(*v,n,n);
284: VecSetType(*v,VECSEQCUDA);
285: return(0);
286: }
288: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
289: {
293: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
294: PetscLayoutReference(win->map,&(*V)->map);
295: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
296: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
297: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
298: return(0);
299: }
301: PetscErrorCode VecCreate_SeqCUDA(Vec V)
302: {
306: PetscLayoutSetUp(V->map);
307: VecCUDAAllocateCheck(V);
308: V->valid_GPU_array = PETSC_OFFLOAD_GPU;
309: VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
310: VecSet(V,0.0);
311: return(0);
312: }
314: /*@C
315: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
316: where the user provides the array space to store the vector values. The array
317: provided must be a GPU array.
319: Collective on MPI_Comm
321: Input Parameter:
322: + comm - the communicator, should be PETSC_COMM_SELF
323: . bs - the block size
324: . n - the vector length
325: - array - GPU memory where the vector elements are to be stored.
327: Output Parameter:
328: . V - the vector
330: Notes:
331: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
332: same type as an existing vector.
334: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
335: at a later stage to SET the array for storing the vector values.
337: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
338: The user should not free the array until the vector is destroyed.
340: Level: intermediate
342: Concepts: vectors^creating with array
344: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
345: VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
346: VecCreateMPIWithArray()
347: @*/
348: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
349: {
351: PetscMPIInt size;
354: VecCreate(comm,V);
355: VecSetSizes(*V,n,n);
356: VecSetBlockSize(*V,bs);
357: MPI_Comm_size(comm,&size);
358: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
359: VecCreate_SeqCUDA_Private(*V,array);
360: return(0);
361: }
363: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
364: {
366: cudaError_t err;
367: Vec_CUDA *veccuda;
368: PetscMPIInt size;
371: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
372: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
373: VecCreate_Seq_Private(V,0);
374: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
376: V->ops->dot = VecDot_SeqCUDA;
377: V->ops->norm = VecNorm_SeqCUDA;
378: V->ops->tdot = VecTDot_SeqCUDA;
379: V->ops->scale = VecScale_SeqCUDA;
380: V->ops->copy = VecCopy_SeqCUDA;
381: V->ops->set = VecSet_SeqCUDA;
382: V->ops->swap = VecSwap_SeqCUDA;
383: V->ops->axpy = VecAXPY_SeqCUDA;
384: V->ops->axpby = VecAXPBY_SeqCUDA;
385: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
386: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
387: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
388: V->ops->setrandom = VecSetRandom_SeqCUDA;
389: V->ops->dot_local = VecDot_SeqCUDA;
390: V->ops->tdot_local = VecTDot_SeqCUDA;
391: V->ops->norm_local = VecNorm_SeqCUDA;
392: V->ops->mdot_local = VecMDot_SeqCUDA;
393: V->ops->maxpy = VecMAXPY_SeqCUDA;
394: V->ops->mdot = VecMDot_SeqCUDA;
395: V->ops->aypx = VecAYPX_SeqCUDA;
396: V->ops->waxpy = VecWAXPY_SeqCUDA;
397: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
398: V->ops->placearray = VecPlaceArray_SeqCUDA;
399: V->ops->replacearray = VecReplaceArray_SeqCUDA;
400: V->ops->resetarray = VecResetArray_SeqCUDA;
401: V->ops->destroy = VecDestroy_SeqCUDA;
402: V->ops->duplicate = VecDuplicate_SeqCUDA;
403: V->ops->conjugate = VecConjugate_SeqCUDA;
404: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
405: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
406: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUDA;
407: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
409: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
410: if (array) {
411: if (!V->spptr) {
412: PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
413: veccuda = (Vec_CUDA*)V->spptr;
414: err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
415: veccuda->GPUarray_allocated = 0;
416: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
417: V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
418: }
419: veccuda = (Vec_CUDA*)V->spptr;
420: veccuda->GPUarray = (PetscScalar*)array;
421: }
423: return(0);
424: }