Actual source code: veccuda2.cu
petsc-3.13.6 2020-09-29
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
18: /*
19: Allocates space for the vector array on the GPU if it does not exist.
20: Does NOT change the offload mask (v->offloadmask) of the vector.
21: Does NOT zero the CUDA array.
23: */
24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
25: {
27: cudaError_t err;
28: Vec_CUDA *veccuda;
29: PetscBool option_set;
32: if (!v->spptr) {
33: PetscReal pinned_memory_min;
34: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
35: veccuda = (Vec_CUDA*)v->spptr;
36: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37: veccuda->GPUarray = veccuda->GPUarray_allocated;
38: veccuda->stream = 0; /* using default stream */
39: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->offloadmask = PETSC_OFFLOAD_CPU;
42: } else {
43: v->offloadmask = PETSC_OFFLOAD_GPU;
44: }
45: }
46: pinned_memory_min = 0;
48: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
49: Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
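   /* e.g. running with -vec_pinned_memory_min 1048576 would use page-locked (pinned) host allocations
      only for vectors of at least 1 MiB; smaller vectors keep using regular host memory */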
50: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
51: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
52: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53: PetscOptionsEnd();
54: }
55: return(0);
56: }
58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59: PetscErrorCode VecCUDACopyToGPU(Vec v)
60: {
62: cudaError_t err;
63: Vec_CUDA *veccuda;
64: PetscScalar *varray;
68: VecCUDAAllocateCheck(v);
69: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
71: veccuda = (Vec_CUDA*)v->spptr;
72: varray = veccuda->GPUarray;
73: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
75: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
76: v->offloadmask = PETSC_OFFLOAD_BOTH;
77: }
78: return(0);
79: }
81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
82: {
83: PetscScalar *varray;
85: cudaError_t err;
86: PetscScalar *cpuPtr, *gpuPtr;
87: Vec_Seq *s;
88: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
89: PetscInt lowestIndex,n;
93: VecCUDAAllocateCheck(v);
94: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
95: s = (Vec_Seq*)v->data;
96: if (mode & SCATTER_REVERSE) {
97: lowestIndex = ptop_scatter->sendLowestIndex;
98: n = ptop_scatter->ns;
99: } else {
100: lowestIndex = ptop_scatter->recvLowestIndex;
101: n = ptop_scatter->nr;
102: }
104: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106: gpuPtr = varray + lowestIndex;
107: cpuPtr = s->array + lowestIndex;
109: /* Note : this code copies the smallest contiguous chunk of data
110: containing ALL of the indices */
111: err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112: PetscLogCpuToGpu(n*sizeof(PetscScalar));
114: /* Set the buffer states */
115: v->offloadmask = PETSC_OFFLOAD_BOTH;
116: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117: }
118: return(0);
119: }
122: /*
123: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
124: */
125: PetscErrorCode VecCUDACopyFromGPU(Vec v)
126: {
128: cudaError_t err;
129: Vec_CUDA *veccuda;
130: PetscScalar *varray;
134: VecCUDAAllocateCheckHost(v);
135: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
136: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
137: veccuda = (Vec_CUDA*)v->spptr;
138: varray = veccuda->GPUarray;
139: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
140: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
141: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
142: v->offloadmask = PETSC_OFFLOAD_BOTH;
143: }
144: return(0);
145: }
147: /* Note that this function only copies *some* of the values up from the GPU to CPU,
148: which means that we need to recombine the data at some point before using any of the standard functions.
149: We could add another few flag-types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
150: which you always have to call in pairs.
151: */
152: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
153: {
154: const PetscScalar *varray, *gpuPtr;
155: PetscErrorCode ierr;
156: cudaError_t err;
157: PetscScalar *cpuPtr;
158: Vec_Seq *s;
159: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
160: PetscInt lowestIndex,n;
164: VecCUDAAllocateCheckHost(v);
165: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
166: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
167: if (mode & SCATTER_REVERSE) {
168: lowestIndex = ptop_scatter->recvLowestIndex;
169: n = ptop_scatter->nr;
170: } else {
171: lowestIndex = ptop_scatter->sendLowestIndex;
172: n = ptop_scatter->ns;
173: }
175: varray=((Vec_CUDA*)v->spptr)->GPUarray;
176: s = (Vec_Seq*)v->data;
177: gpuPtr = varray + lowestIndex;
178: cpuPtr = s->array + lowestIndex;
180: /* Note : this code copies the smallest contiguous chunk of data
181: containing ALL of the indices */
182: err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
183: PetscLogGpuToCpu(n*sizeof(PetscScalar));
185: VecCUDARestoreArrayRead(v,&varray);
186: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
187: }
188: return(0);
189: }
191: /*MC
192: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
194: Options Database Keys:
195: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
197: Level: beginner
199: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
200: M*/
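A minimal usage sketch (not part of veccuda2.cu) showing how a VECSEQCUDA vector is typically created
and filled; every call below is standard PETSc API, and the local length n is a value chosen purely
for illustration:

    Vec            v;
    PetscErrorCode ierr;
    PetscInt       n = 100;                          /* local (= global) length, illustration only */

    ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
    ierr = VecSetSizes(v,n,n);CHKERRQ(ierr);
    ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);   /* or -vec_type seqcuda together with VecSetFromOptions() */
    ierr = VecSet(v,1.0);CHKERRQ(ierr);              /* dispatches to VecSet_SeqCUDA() defined below */
    ierr = VecDestroy(&v);CHKERRQ(ierr);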
202: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
203: {
204: const PetscScalar *xarray;
205: PetscScalar *yarray;
206: PetscErrorCode ierr;
207: PetscBLASInt one=1,bn;
208: PetscScalar sone=1.0;
209: cublasHandle_t cublasv2handle;
210: cublasStatus_t cberr;
211: cudaError_t err;
214: PetscCUBLASGetHandle(&cublasv2handle);
215: PetscBLASIntCast(yin->map->n,&bn);
216: VecCUDAGetArrayRead(xin,&xarray);
217: VecCUDAGetArray(yin,&yarray);
218: PetscLogGpuTimeBegin();
219: if (alpha == (PetscScalar)0.0) {
220: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
221: } else if (alpha == (PetscScalar)1.0) {
222: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
223: PetscLogGpuFlops(1.0*yin->map->n);
224: } else {
225: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
226: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
227: PetscLogGpuFlops(2.0*yin->map->n);
228: }
229: err = WaitForGPU();CHKERRCUDA(err);
230: PetscLogGpuTimeEnd();
231: VecCUDARestoreArrayRead(xin,&xarray);
232: VecCUDARestoreArray(yin,&yarray);
233: return(0);
234: }
236: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
237: {
238: const PetscScalar *xarray;
239: PetscScalar *yarray;
240: PetscErrorCode ierr;
241: PetscBLASInt one=1,bn;
242: cublasHandle_t cublasv2handle;
243: cublasStatus_t cberr;
244: cudaError_t err;
247: PetscCUBLASGetHandle(&cublasv2handle);
248: if (alpha != (PetscScalar)0.0) {
249: PetscBLASIntCast(yin->map->n,&bn);
250: VecCUDAGetArrayRead(xin,&xarray);
251: VecCUDAGetArray(yin,&yarray);
252: PetscLogGpuTimeBegin();
253: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
254: err = WaitForGPU();CHKERRCUDA(err);
255: PetscLogGpuTimeEnd();
256: VecCUDARestoreArrayRead(xin,&xarray);
257: VecCUDARestoreArray(yin,&yarray);
258: PetscLogGpuFlops(2.0*yin->map->n);
259: }
260: return(0);
261: }
263: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
264: {
265: PetscInt n = xin->map->n;
266: const PetscScalar *xarray=NULL,*yarray=NULL;
267: PetscScalar *warray=NULL;
268: thrust::device_ptr<const PetscScalar> xptr,yptr;
269: thrust::device_ptr<PetscScalar> wptr;
270: PetscErrorCode ierr;
271: cudaError_t err;
274: VecCUDAGetArrayWrite(win,&warray);
275: VecCUDAGetArrayRead(xin,&xarray);
276: VecCUDAGetArrayRead(yin,&yarray);
277: PetscLogGpuTimeBegin();
278: try {
279: wptr = thrust::device_pointer_cast(warray);
280: xptr = thrust::device_pointer_cast(xarray);
281: yptr = thrust::device_pointer_cast(yarray);
282: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
283: err = WaitForGPU();CHKERRCUDA(err);
284: } catch (char *ex) {
285: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
286: }
287: PetscLogGpuTimeEnd();
288: PetscLogGpuFlops(n);
289: VecCUDARestoreArrayRead(xin,&xarray);
290: VecCUDARestoreArrayRead(yin,&yarray);
291: VecCUDARestoreArrayWrite(win,&warray);
292: return(0);
293: }
295: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
296: {
297: const PetscScalar *xarray=NULL,*yarray=NULL;
298: PetscScalar *warray=NULL;
299: PetscErrorCode ierr;
300: PetscBLASInt one=1,bn;
301: cublasHandle_t cublasv2handle;
302: cublasStatus_t cberr;
303: cudaError_t err;
306: PetscCUBLASGetHandle(&cublasv2handle);
307: PetscBLASIntCast(win->map->n,&bn);
308: if (alpha == (PetscScalar)0.0) {
309: VecCopy_SeqCUDA(yin,win);
310: } else {
311: VecCUDAGetArrayRead(xin,&xarray);
312: VecCUDAGetArrayRead(yin,&yarray);
313: VecCUDAGetArrayWrite(win,&warray);
314: PetscLogGpuTimeBegin();
315: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
316: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
317: err = WaitForGPU();CHKERRCUDA(err);
318: PetscLogGpuTimeEnd();
319: PetscLogGpuFlops(2*win->map->n);
320: VecCUDARestoreArrayRead(xin,&xarray);
321: VecCUDARestoreArrayRead(yin,&yarray);
322: VecCUDARestoreArrayWrite(win,&warray);
323: }
324: return(0);
325: }
327: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
328: {
330: cudaError_t err;
331: PetscInt n = xin->map->n,j,j_rem;
332: PetscScalar alpha0,alpha1,alpha2,alpha3;
335: PetscLogGpuFlops(nv*2.0*n);
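  /* peel off the nv mod 4 leftover AXPYs first, then process the remaining vectors four at a time */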
336: switch (j_rem=nv&0x3) {
337: case 3:
338: alpha0 = alpha[0];
339: alpha1 = alpha[1];
340: alpha2 = alpha[2];
341: alpha += 3;
342: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
343: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
344: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
345: y += 3;
346: break;
347: case 2:
348: alpha0 = alpha[0];
349: alpha1 = alpha[1];
350: alpha +=2;
351: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
352: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
353: y +=2;
354: break;
355: case 1:
356: alpha0 = *alpha++;
357: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
358: y +=1;
359: break;
360: }
361: for (j=j_rem; j<nv; j+=4) {
362: alpha0 = alpha[0];
363: alpha1 = alpha[1];
364: alpha2 = alpha[2];
365: alpha3 = alpha[3];
366: alpha += 4;
367: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
368: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
369: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
370: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
371: y += 4;
372: }
373: err = WaitForGPU();CHKERRCUDA(err);
374: return(0);
375: }
377: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
378: {
379: const PetscScalar *xarray,*yarray;
380: PetscErrorCode ierr;
381: PetscBLASInt one=1,bn;
382: cublasHandle_t cublasv2handle;
383: cublasStatus_t cerr;
386: PetscCUBLASGetHandle(&cublasv2handle);
387: PetscBLASIntCast(yin->map->n,&bn);
388: VecCUDAGetArrayRead(xin,&xarray);
389: VecCUDAGetArrayRead(yin,&yarray);
390: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
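  /* (for complex scalars cublasXdot() computes sum_i conj(a_i)*b_i over its first argument a, while
      VecDot(x,y) is defined as sum_i x_i*conj(y_i); passing (y,x) therefore reproduces the PETSc result) */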
391: PetscLogGpuTimeBegin();
392: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
393: PetscLogGpuTimeEnd();
394: if (xin->map->n >0) {
395: PetscLogGpuFlops(2.0*xin->map->n-1);
396: }
397: VecCUDARestoreArrayRead(xin,&xarray);
398: VecCUDARestoreArrayRead(yin,&yarray);
399: return(0);
400: }
402: //
403: // CUDA kernels for MDot to follow
404: //
406: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
407: #define MDOT_WORKGROUP_SIZE 128
408: #define MDOT_WORKGROUP_NUM 128
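// Each of the MDOT_WORKGROUP_NUM blocks launched below produces one partial sum per dot product in
// group_results; the host finishes the reduction as z[j] = sum over b of group_results[j*MDOT_WORKGROUP_NUM + b].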
410: #if !defined(PETSC_USE_COMPLEX)
411: // M = 2:
412: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
413: PetscInt size, PetscScalar *group_results)
414: {
415: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
416: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
417: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
418: PetscInt vec_start_index = blockIdx.x * entries_per_group;
419: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
421: PetscScalar entry_x = 0;
422: PetscScalar group_sum0 = 0;
423: PetscScalar group_sum1 = 0;
424: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
425: entry_x = x[i]; // load only once from global memory!
426: group_sum0 += entry_x * y0[i];
427: group_sum1 += entry_x * y1[i];
428: }
429: tmp_buffer[threadIdx.x] = group_sum0;
430: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
432: // parallel reduction
433: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
434: __syncthreads();
435: if (threadIdx.x < stride) {
436: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
437: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
438: }
439: }
441: // write result of group to group_results
442: if (threadIdx.x == 0) {
443: group_results[blockIdx.x] = tmp_buffer[0];
444: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
445: }
446: }
448: // M = 3:
449: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
450: PetscInt size, PetscScalar *group_results)
451: {
452: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
453: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
454: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
455: PetscInt vec_start_index = blockIdx.x * entries_per_group;
456: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
458: PetscScalar entry_x = 0;
459: PetscScalar group_sum0 = 0;
460: PetscScalar group_sum1 = 0;
461: PetscScalar group_sum2 = 0;
462: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463: entry_x = x[i]; // load only once from global memory!
464: group_sum0 += entry_x * y0[i];
465: group_sum1 += entry_x * y1[i];
466: group_sum2 += entry_x * y2[i];
467: }
468: tmp_buffer[threadIdx.x] = group_sum0;
469: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
470: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
472: // parallel reduction
473: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
474: __syncthreads();
475: if (threadIdx.x < stride) {
476: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
477: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
478: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
479: }
480: }
482: // write result of group to group_results
483: if (threadIdx.x == 0) {
484: group_results[blockIdx.x ] = tmp_buffer[0];
485: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
486: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
487: }
488: }
490: // M = 4:
491: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
492: PetscInt size, PetscScalar *group_results)
493: {
494: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
495: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
496: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
497: PetscInt vec_start_index = blockIdx.x * entries_per_group;
498: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
500: PetscScalar entry_x = 0;
501: PetscScalar group_sum0 = 0;
502: PetscScalar group_sum1 = 0;
503: PetscScalar group_sum2 = 0;
504: PetscScalar group_sum3 = 0;
505: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
506: entry_x = x[i]; // load only once from global memory!
507: group_sum0 += entry_x * y0[i];
508: group_sum1 += entry_x * y1[i];
509: group_sum2 += entry_x * y2[i];
510: group_sum3 += entry_x * y3[i];
511: }
512: tmp_buffer[threadIdx.x] = group_sum0;
513: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
514: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
515: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
517: // parallel reduction
518: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
519: __syncthreads();
520: if (threadIdx.x < stride) {
521: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
522: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
523: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
524: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
525: }
526: }
528: // write result of group to group_results
529: if (threadIdx.x == 0) {
530: group_results[blockIdx.x ] = tmp_buffer[0];
531: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
532: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
533: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
534: }
535: }
537: // M = 8:
538: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
539: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
540: PetscInt size, PetscScalar *group_results)
541: {
542: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
543: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
544: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
545: PetscInt vec_start_index = blockIdx.x * entries_per_group;
546: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
548: PetscScalar entry_x = 0;
549: PetscScalar group_sum0 = 0;
550: PetscScalar group_sum1 = 0;
551: PetscScalar group_sum2 = 0;
552: PetscScalar group_sum3 = 0;
553: PetscScalar group_sum4 = 0;
554: PetscScalar group_sum5 = 0;
555: PetscScalar group_sum6 = 0;
556: PetscScalar group_sum7 = 0;
557: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
558: entry_x = x[i]; // load only once from global memory!
559: group_sum0 += entry_x * y0[i];
560: group_sum1 += entry_x * y1[i];
561: group_sum2 += entry_x * y2[i];
562: group_sum3 += entry_x * y3[i];
563: group_sum4 += entry_x * y4[i];
564: group_sum5 += entry_x * y5[i];
565: group_sum6 += entry_x * y6[i];
566: group_sum7 += entry_x * y7[i];
567: }
568: tmp_buffer[threadIdx.x] = group_sum0;
569: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
570: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
571: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
572: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
573: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
574: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
575: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
577: // parallel reduction
578: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
579: __syncthreads();
580: if (threadIdx.x < stride) {
581: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
582: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
583: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
584: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
585: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
586: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
587: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
588: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
589: }
590: }
592: // write result of group to group_results
593: if (threadIdx.x == 0) {
594: group_results[blockIdx.x ] = tmp_buffer[0];
595: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
596: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
597: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
598: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
599: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
600: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
601: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
602: }
603: }
604: #endif /* !defined(PETSC_USE_COMPLEX) */
606: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
607: {
608: PetscErrorCode ierr;
609: PetscInt i,n = xin->map->n,current_y_index = 0;
610: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
611: PetscScalar *group_results_gpu;
612: #if !defined(PETSC_USE_COMPLEX)
613: PetscInt j;
614: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
615: #endif
616: cudaError_t cuda_ierr;
617: PetscBLASInt one=1,bn;
618: cublasHandle_t cublasv2handle;
619: cublasStatus_t cberr;
622: PetscCUBLASGetHandle(&cublasv2handle);
623: PetscBLASIntCast(xin->map->n,&bn);
624: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
625: /* Handle the case of local size zero first */
626: if (!xin->map->n) {
627: for (i=0; i<nv; ++i) z[i] = 0;
628: return(0);
629: }
631: // allocate scratchpad memory for the results of individual work groups:
632: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
634: VecCUDAGetArrayRead(xin,&xptr);
636: while (current_y_index < nv)
637: {
638: switch (nv - current_y_index) {
640: case 7:
641: case 6:
642: case 5:
643: case 4:
644: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
645: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
646: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
647: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
648: PetscLogGpuTimeBegin();
649: #if defined(PETSC_USE_COMPLEX)
650: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
651: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
652: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
653: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
654: #else
655: // run kernel:
656: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
657: PetscLogGpuTimeEnd();
659: // copy results back to host
660: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
662: // sum group results into z:
663: for (j=0; j<4; ++j) {
664: z[current_y_index + j] = 0;
665: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
666: }
667: #endif
668: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
669: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
670: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
671: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
672: current_y_index += 4;
673: break;
675: case 3:
676: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
677: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
678: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
680: PetscLogGpuTimeBegin();
681: #if defined(PETSC_USE_COMPLEX)
682: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
683: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
684: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
685: #else
686: // run kernel:
687: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
688: PetscLogGpuTimeEnd();
690: // copy results back to host
691: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
693: // sum group results into z:
694: for (j=0; j<3; ++j) {
695: z[current_y_index + j] = 0;
696: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
697: }
698: #endif
699: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
700: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
701: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
702: current_y_index += 3;
703: break;
705: case 2:
706: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
707: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
708: PetscLogGpuTimeBegin();
709: #if defined(PETSC_USE_COMPLEX)
710: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
711: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
712: #else
713: // run kernel:
714: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
715: PetscLogGpuTimeEnd();
717: // copy results back to host
718: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
720: // sum group results into z:
721: for (j=0; j<2; ++j) {
722: z[current_y_index + j] = 0;
723: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
724: }
725: #endif
726: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
727: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
728: current_y_index += 2;
729: break;
731: case 1:
732: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
733: PetscLogGpuTimeBegin();
734: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
735: PetscLogGpuTimeEnd();
736: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
737: current_y_index += 1;
738: break;
740: default: // 8 or more vectors left
741: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
742: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
743: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
744: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
745: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
746: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
747: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
748: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
749: PetscLogGpuTimeBegin();
750: #if defined(PETSC_USE_COMPLEX)
751: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
752: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
753: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
754: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
755: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
756: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
757: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
758: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
759: #else
760: // run kernel:
761: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
762: PetscLogGpuTimeEnd();
764: // copy results back to host
765: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
767: // sum group results into z:
768: for (j=0; j<8; ++j) {
769: z[current_y_index + j] = 0;
770: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
771: }
772: #endif
773: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
774: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
775: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
776: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
777: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
778: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
779: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
780: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
781: current_y_index += 8;
782: break;
783: }
784: }
785: VecCUDARestoreArrayRead(xin,&xptr);
787: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
788: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
789: return(0);
790: }
792: #undef MDOT_WORKGROUP_SIZE
793: #undef MDOT_WORKGROUP_NUM
795: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
796: {
797: PetscInt n = xin->map->n;
798: PetscScalar *xarray=NULL;
799: thrust::device_ptr<PetscScalar> xptr;
800: PetscErrorCode ierr;
801: cudaError_t err;
804: VecCUDAGetArrayWrite(xin,&xarray);
805: PetscLogGpuTimeBegin();
806: if (alpha == (PetscScalar)0.0) {
807: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
808: } else {
809: try {
810: xptr = thrust::device_pointer_cast(xarray);
811: thrust::fill(xptr,xptr+n,alpha);
812: } catch (char *ex) {
813: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
814: }
815: }
816: err = WaitForGPU();CHKERRCUDA(err);
817: PetscLogGpuTimeEnd();
818: VecCUDARestoreArrayWrite(xin,&xarray);
819: return(0);
820: }
822: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
823: {
824: PetscScalar *xarray;
826: PetscBLASInt one=1,bn;
827: cublasHandle_t cublasv2handle;
828: cublasStatus_t cberr;
829: cudaError_t err;
832: if (alpha == (PetscScalar)0.0) {
833: VecSet_SeqCUDA(xin,alpha);
834: err = WaitForGPU();CHKERRCUDA(err);
835: } else if (alpha != (PetscScalar)1.0) {
836: PetscCUBLASGetHandle(&cublasv2handle);
837: PetscBLASIntCast(xin->map->n,&bn);
838: VecCUDAGetArray(xin,&xarray);
839: PetscLogGpuTimeBegin();
840: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
841: VecCUDARestoreArray(xin,&xarray);
842: err = WaitForGPU();CHKERRCUDA(err);
843: PetscLogGpuTimeEnd();
844: }
845: PetscLogGpuFlops(xin->map->n);
846: return(0);
847: }
849: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
850: {
851: const PetscScalar *xarray,*yarray;
852: PetscErrorCode ierr;
853: PetscBLASInt one=1,bn;
854: cublasHandle_t cublasv2handle;
855: cublasStatus_t cerr;
858: PetscCUBLASGetHandle(&cublasv2handle);
859: PetscBLASIntCast(xin->map->n,&bn);
860: VecCUDAGetArrayRead(xin,&xarray);
861: VecCUDAGetArrayRead(yin,&yarray);
862: PetscLogGpuTimeBegin();
863: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
864: PetscLogGpuTimeEnd();
865: if (xin->map->n > 0) {
866: PetscLogGpuFlops(2.0*xin->map->n-1);
867: }
868: VecCUDARestoreArrayRead(yin,&yarray);
869: VecCUDARestoreArrayRead(xin,&xarray);
870: return(0);
871: }
873: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
874: {
875: const PetscScalar *xarray;
876: PetscScalar *yarray;
877: PetscErrorCode ierr;
878: cudaError_t err;
881: if (xin != yin) {
882: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
883: PetscBool yiscuda;
885: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
886: VecCUDAGetArrayRead(xin,&xarray);
887: if (yiscuda) {
888: VecCUDAGetArrayWrite(yin,&yarray);
889: } else {
890: VecGetArrayWrite(yin,&yarray);
891: }
892: PetscLogGpuTimeBegin();
893: if (yiscuda) {
894: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
895: } else {
896: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
897: }
898: err = WaitForGPU();CHKERRCUDA(err);
899: PetscLogGpuTimeEnd();
900: VecCUDARestoreArrayRead(xin,&xarray);
901: if (yiscuda) {
902: VecCUDARestoreArrayWrite(yin,&yarray);
903: } else {
904: VecRestoreArrayWrite(yin,&yarray);
905: }
906: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
907: /* copy in CPU if we are on the CPU */
908: VecCopy_SeqCUDA_Private(xin,yin);
909: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
910: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
911: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
912: /* copy in CPU */
913: VecCopy_SeqCUDA_Private(xin,yin);
914: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
915: /* copy in GPU */
916: VecCUDAGetArrayRead(xin,&xarray);
917: VecCUDAGetArrayWrite(yin,&yarray);
918: PetscLogGpuTimeBegin();
919: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
920: PetscLogGpuTimeEnd();
921: VecCUDARestoreArrayRead(xin,&xarray);
922: VecCUDARestoreArrayWrite(yin,&yarray);
923: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
924: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
925: default to copy in GPU (this is an arbitrary choice) */
926: VecCUDAGetArrayRead(xin,&xarray);
927: VecCUDAGetArrayWrite(yin,&yarray);
928: PetscLogGpuTimeBegin();
929: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
930: PetscLogGpuTimeEnd();
931: VecCUDARestoreArrayRead(xin,&xarray);
932: VecCUDARestoreArrayWrite(yin,&yarray);
933: } else {
934: VecCopy_SeqCUDA_Private(xin,yin);
935: }
936: }
937: }
938: return(0);
939: }
941: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
942: {
944: PetscBLASInt one = 1,bn;
945: PetscScalar *xarray,*yarray;
946: cublasHandle_t cublasv2handle;
947: cublasStatus_t cberr;
948: cudaError_t err;
951: PetscCUBLASGetHandle(&cublasv2handle);
952: PetscBLASIntCast(xin->map->n,&bn);
953: if (xin != yin) {
954: VecCUDAGetArray(xin,&xarray);
955: VecCUDAGetArray(yin,&yarray);
956: PetscLogGpuTimeBegin();
957: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
958: err = WaitForGPU();CHKERRCUDA(err);
959: PetscLogGpuTimeEnd();
960: VecCUDARestoreArray(xin,&xarray);
961: VecCUDARestoreArray(yin,&yarray);
962: }
963: return(0);
964: }
966: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
967: {
968: PetscErrorCode ierr;
969: PetscScalar a = alpha,b = beta;
970: const PetscScalar *xarray;
971: PetscScalar *yarray;
972: PetscBLASInt one = 1, bn;
973: cublasHandle_t cublasv2handle;
974: cublasStatus_t cberr;
975: cudaError_t err;
978: PetscCUBLASGetHandle(&cublasv2handle);
979: PetscBLASIntCast(yin->map->n,&bn);
980: if (a == (PetscScalar)0.0) {
981: VecScale_SeqCUDA(yin,beta);
982: } else if (b == (PetscScalar)1.0) {
983: VecAXPY_SeqCUDA(yin,alpha,xin);
984: } else if (a == (PetscScalar)1.0) {
985: VecAYPX_SeqCUDA(yin,beta,xin);
986: } else if (b == (PetscScalar)0.0) {
987: VecCUDAGetArrayRead(xin,&xarray);
988: VecCUDAGetArray(yin,&yarray);
989: PetscLogGpuTimeBegin();
990: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
991: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
992: err = WaitForGPU();CHKERRCUDA(err);
993: PetscLogGpuTimeEnd();
994: PetscLogGpuFlops(xin->map->n);
995: VecCUDARestoreArrayRead(xin,&xarray);
996: VecCUDARestoreArray(yin,&yarray);
997: } else {
998: VecCUDAGetArrayRead(xin,&xarray);
999: VecCUDAGetArray(yin,&yarray);
1000: PetscLogGpuTimeBegin();
1001: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1002: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1003: VecCUDARestoreArrayRead(xin,&xarray);
1004: VecCUDARestoreArray(yin,&yarray);
1005: err = WaitForGPU();CHKERRCUDA(err);
1006: PetscLogGpuTimeEnd();
1007: PetscLogGpuFlops(3.0*xin->map->n);
1008: }
1009: return(0);
1010: }
1012: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1013: {
1015: cudaError_t err;
1016: PetscInt n = zin->map->n;
1019: if (gamma == (PetscScalar)1.0) {
1020: /* z = a*x + b*y + z */
1021: VecAXPY_SeqCUDA(zin,alpha,xin);
1022: VecAXPY_SeqCUDA(zin,beta,yin);
1023: PetscLogGpuFlops(4.0*n);
1024: } else {
1025: /* z = a*x + b*y + c*z */
1026: VecScale_SeqCUDA(zin,gamma);
1027: VecAXPY_SeqCUDA(zin,alpha,xin);
1028: VecAXPY_SeqCUDA(zin,beta,yin);
1029: PetscLogGpuFlops(5.0*n);
1030: }
1031: err = WaitForGPU();CHKERRCUDA(err);
1032: return(0);
1033: }
1035: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1036: {
1037: PetscInt n = win->map->n;
1038: const PetscScalar *xarray,*yarray;
1039: PetscScalar *warray;
1040: thrust::device_ptr<const PetscScalar> xptr,yptr;
1041: thrust::device_ptr<PetscScalar> wptr;
1042: PetscErrorCode ierr;
1043: cudaError_t err;
1046: VecCUDAGetArray(win,&warray);
1047: VecCUDAGetArrayRead(xin,&xarray);
1048: VecCUDAGetArrayRead(yin,&yarray);
1049: PetscLogGpuTimeBegin();
1050: try {
1051: wptr = thrust::device_pointer_cast(warray);
1052: xptr = thrust::device_pointer_cast(xarray);
1053: yptr = thrust::device_pointer_cast(yarray);
1054: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1055: err = WaitForGPU();CHKERRCUDA(err);
1056: } catch (char *ex) {
1057: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1058: }
1059: PetscLogGpuTimeEnd();
1060: VecCUDARestoreArrayRead(xin,&xarray);
1061: VecCUDARestoreArrayRead(yin,&yarray);
1062: VecCUDARestoreArray(win,&warray);
1063: PetscLogGpuFlops(n);
1064: return(0);
1065: }
1067: /* should do infinity norm in cuda */
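/* (currently the NORM_INFINITY branch below uses cublasIXamax() on the GPU and then copies the single
    maximal entry back to the host) */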
1069: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1070: {
1071: PetscErrorCode ierr;
1072: PetscInt n = xin->map->n;
1073: PetscBLASInt one = 1, bn;
1074: const PetscScalar *xarray;
1075: cublasHandle_t cublasv2handle;
1076: cublasStatus_t cberr;
1077: cudaError_t err;
1080: PetscCUBLASGetHandle(&cublasv2handle);
1081: PetscBLASIntCast(n,&bn);
1082: if (type == NORM_2 || type == NORM_FROBENIUS) {
1083: VecCUDAGetArrayRead(xin,&xarray);
1084: PetscLogGpuTimeBegin();
1085: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1086: PetscLogGpuTimeEnd();
1087: VecCUDARestoreArrayRead(xin,&xarray);
1088: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1089: } else if (type == NORM_INFINITY) {
1090: int i;
1091: VecCUDAGetArrayRead(xin,&xarray);
1092: PetscLogGpuTimeBegin();
1093: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1094: PetscLogGpuTimeEnd();
1095: if (bn) {
1096: PetscScalar zs;
1097: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err); /* cuBLAS amax returns a 1-based index */
1098: *z = PetscAbsScalar(zs);
1099: } else *z = 0.0;
1100: VecCUDARestoreArrayRead(xin,&xarray);
1101: } else if (type == NORM_1) {
1102: VecCUDAGetArrayRead(xin,&xarray);
1103: PetscLogGpuTimeBegin();
1104: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1105: VecCUDARestoreArrayRead(xin,&xarray);
1106: PetscLogGpuTimeEnd();
1107: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1108: } else if (type == NORM_1_AND_2) {
1109: VecNorm_SeqCUDA(xin,NORM_1,z);
1110: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1111: }
1112: return(0);
1113: }
1115: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1116: {
1117: PetscErrorCode ierr;
1118: cudaError_t err;
1119: PetscReal n=s->map->n;
1120: const PetscScalar *sarray,*tarray;
1123: VecCUDAGetArrayRead(s,&sarray);
1124: VecCUDAGetArrayRead(t,&tarray);
1125: VecDot_SeqCUDA(s,t,dp);
1126: VecDot_SeqCUDA(t,t,nm);
1127: VecCUDARestoreArrayRead(s,&sarray);
1128: VecCUDARestoreArrayRead(t,&tarray);
1129: err = WaitForGPU();CHKERRCUDA(err);
1130: PetscLogGpuFlops(4.0*n);
1131: return(0);
1132: }
1134: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1135: {
1137: cudaError_t err;
1140: if (v->spptr) {
1141: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1142: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1143: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1144: }
1145: if (((Vec_CUDA*)v->spptr)->stream) {
1146: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1147: }
1148: }
1149: VecDestroy_SeqCUDA_Private(v);
1150: PetscFree(v->spptr);
1151: return(0);
1152: }
1154: #if defined(PETSC_USE_COMPLEX)
1155: struct conjugate
1156: {
1157: __host__ __device__
1158: PetscScalar operator()(PetscScalar x)
1159: {
1160: return PetscConj(x);
1161: }
1162: };
1163: #endif
1165: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1166: {
1167: PetscScalar *xarray;
1168: PetscErrorCode ierr;
1169: #if defined(PETSC_USE_COMPLEX)
1170: PetscInt n = xin->map->n;
1171: thrust::device_ptr<PetscScalar> xptr;
1172: cudaError_t err;
1173: #endif
1176: VecCUDAGetArray(xin,&xarray);
1177: #if defined(PETSC_USE_COMPLEX)
1178: PetscLogGpuTimeBegin();
1179: try {
1180: xptr = thrust::device_pointer_cast(xarray);
1181: thrust::transform(xptr,xptr+n,xptr,conjugate());
1182: err = WaitForGPU();CHKERRCUDA(err);
1183: } catch (char *ex) {
1184: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1185: }
1186: PetscLogGpuTimeEnd();
1187: #endif
1188: VecCUDARestoreArray(xin,&xarray);
1189: return(0);
1190: }
1192: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1193: {
1195: cudaError_t err;
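  /* release any host and device storage w currently owns; for PETSc-native vectors, v's storage is then aliased into w */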
1202: if (w->data) {
1203: if (((Vec_Seq*)w->data)->array_allocated) {
1204: if (w->pinned_memory) {
1205: PetscMallocSetCUDAHost();
1206: }
1207: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1208: if (w->pinned_memory) {
1209: PetscMallocResetCUDAHost();
1210: w->pinned_memory = PETSC_FALSE;
1211: }
1212: }
1213: ((Vec_Seq*)w->data)->array = NULL;
1214: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1215: }
1216: if (w->spptr) {
1217: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1218: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1219: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1220: }
1221: if (((Vec_CUDA*)w->spptr)->stream) {
1222: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1223: }
1224: PetscFree(w->spptr);
1225: }
1227: if (v->petscnative) {
1228: PetscFree(w->data);
1229: w->data = v->data;
1230: w->offloadmask = v->offloadmask;
1231: w->pinned_memory = v->pinned_memory;
1232: w->spptr = v->spptr;
1233: PetscObjectStateIncrease((PetscObject)w);
1234: } else {
1235: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1236: w->offloadmask = PETSC_OFFLOAD_CPU;
1237: VecCUDAAllocateCheck(w);
1238: }
1239: return(0);
1240: }
1242: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1243: {
1245: cudaError_t err;
1252: if (v->petscnative) {
1253: v->data = w->data;
1254: v->offloadmask = w->offloadmask;
1255: v->pinned_memory = w->pinned_memory;
1256: v->spptr = w->spptr;
1257: VecCUDACopyFromGPU(v);
1258: PetscObjectStateIncrease((PetscObject)v);
1259: w->data = 0;
1260: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1261: w->spptr = 0;
1262: } else {
1263: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1264: if ((Vec_CUDA*)w->spptr) {
1265: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1266: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1267: if (((Vec_CUDA*)w->spptr)->stream) {
1268: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1269: }
1270: PetscFree(w->spptr);
1271: }
1272: }
1273: return(0);
1274: }