Actual source code: veccuda2.cu
/*
   Implements the sequential CUDA vectors.
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#if defined(PETSC_USE_COMPLEX)
#include <thrust/transform_reduce.h>
#endif
#if THRUST_VERSION >= 101600 && !PetscDefined(USE_DEBUG)
static thrust::cuda_cub::par_nosync_t::stream_attachment_type VecCUDAThrustPolicy(Vec x) {
  return thrust::cuda::par_nosync.on(((Vec_CUDA*)x->spptr)->stream);
}
#else
static thrust::cuda_cub::par_t::stream_attachment_type VecCUDAThrustPolicy(Vec x) {
  return thrust::cuda::par.on(((Vec_CUDA*)x->spptr)->stream);
}
#endif
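
/* VecCUDAThrustPolicy() attaches a Thrust execution policy to the vector's CUDA
   stream, so the Thrust algorithms in this file run on that stream rather than
   the default stream; with Thrust >= 1.16 in optimized builds, par_nosync also
   skips the implicit synchronization at the end of each algorithm. The call
   pattern used throughout this file is

     thrust::transform(VecCUDAThrustPolicy(v),first,last,result,op);
*/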
/*
    Allocates space for the vector array on the GPU if it does not exist.
    Does NOT change the vector's offload mask.
    Does NOT zero the CUDA array.
*/
PetscErrorCode VecCUDAAllocateCheck(Vec v)
{
  Vec_CUDA  *veccuda;
  PetscBool option_set;

  if (!v->spptr) {
    PetscReal pinned_memory_min;
    PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
    veccuda = (Vec_CUDA*)v->spptr;
    cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));
    veccuda->GPUarray = veccuda->GPUarray_allocated;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq*)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
       Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
    PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
    PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    PetscOptionsEnd();
  }
  return 0;
}
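
/* The pinned-memory threshold parsed above can also be set programmatically;
   a minimal sketch (error checking omitted):

     VecCreateSeqCUDA(PETSC_COMM_SELF,n,&v);
     VecSetPinnedMemoryMin(v,100000);   // host allocations of at least 100000 bytes use pinned memory

   or equivalently on the command line: -vec_pinned_memory_min 100000 */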
/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecCUDACopyToGPU(Vec v)
{
  Vec_CUDA    *veccuda;
  PetscScalar *varray;

  VecCUDAAllocateCheck(v);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
    veccuda = (Vec_CUDA*)v->spptr;
    varray  = veccuda->GPUarray;
    cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
/*
     VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecCUDACopyFromGPU(Vec v)
{
  Vec_CUDA    *veccuda;
  PetscScalar *varray;

  VecCUDAAllocateCheckHost(v);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
    veccuda = (Vec_CUDA*)v->spptr;
    varray  = veccuda->GPUarray;
    cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
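
/* Together the two routines above implement the lazy mirroring of data between
   host and device: the offload mask records which copy is current
   (PETSC_OFFLOAD_CPU, PETSC_OFFLOAD_GPU, or PETSC_OFFLOAD_BOTH), a copy is
   performed only when the needed side is stale, and a successful copy upgrades
   the mask to PETSC_OFFLOAD_BOTH. A sketch of the resulting behavior, assuming
   x is a VECSEQCUDA vector (error checking omitted):

     VecGetArray(x,&a);           // triggers VecCUDACopyFromGPU() if the mask is PETSC_OFFLOAD_GPU
     a[0] = 1.0;
     VecRestoreArray(x,&a);       // marks the host copy current: mask becomes PETSC_OFFLOAD_CPU
     VecCUDAGetArray(x,&d_a);     // triggers VecCUDACopyToGPU() since the device copy is now stale
     VecCUDARestoreArray(x,&d_a);
*/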
/*MC
   VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

   Options Database Keys:
.  -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
M*/
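
/* A minimal sketch of creating a VECSEQCUDA vector of local length n (error
   checking omitted); VecSetFromOptions() lets -vec_type override the type at
   run time:

     Vec x;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,n);
     VecSetType(x,VECSEQCUDA);
     VecSetFromOptions(x);
     ...
     VecDestroy(&x);
*/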
PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  PetscScalar       sone = 1.0;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecCUDAGetArrayRead(xin,&xarray);
  VecCUDAGetArray(yin,&yarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  } else if (alpha == (PetscScalar)1.0) {
    cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuFlops(1.0*yin->map->n);
  } else {
    cublasXscal(cublasv2handle,bn,&alpha,yarray,one);
    cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);
    PetscLogGpuFlops(2.0*yin->map->n);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(xin,&xarray);
  VecCUDARestoreArray(yin,&yarray);
  PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  return 0;
}
PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  PetscBool         xiscuda;

  if (alpha == (PetscScalar)0.0) return 0;
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
  if (xiscuda) {
    PetscBLASIntCast(yin->map->n,&bn);
    VecCUDAGetArrayRead(xin,&xarray);
    VecCUDAGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecCUDARestoreArrayRead(xin,&xarray);
    VecCUDARestoreArray(yin,&yarray);
    PetscLogGpuFlops(2.0*yin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  } else {
    VecAXPY_Seq(yin,alpha,xin);
  }
  return 0;
}
PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
{
  PetscInt                              n = xin->map->n;
  const PetscScalar                     *xarray=NULL,*yarray=NULL;
  PetscScalar                           *warray=NULL;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseDivide_Seq(win,xin,yin);
    return 0;
  }
  VecCUDAGetArrayWrite(win,&warray);
  VecCUDAGetArrayRead(xin,&xarray);
  VecCUDAGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
    thrust::transform(VecCUDAThrustPolicy(win),xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n);
  VecCUDARestoreArrayRead(xin,&xarray);
  VecCUDARestoreArrayRead(yin,&yarray);
  VecCUDARestoreArrayWrite(win,&warray);
  return 0;
}
PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cudaStream_t      stream;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(win->map->n,&bn);
  if (alpha == (PetscScalar)0.0) {
    VecCopy_SeqCUDA(yin,win);
  } else {
    VecCUDAGetArrayRead(xin,&xarray);
    VecCUDAGetArrayRead(yin,&yarray);
    VecCUDAGetArrayWrite(win,&warray);
    PetscLogGpuTimeBegin();
    cublasGetStream(cublasv2handle,&stream);
    cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);
    cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2*win->map->n);
    VecCUDARestoreArrayRead(xin,&xarray);
    VecCUDARestoreArrayRead(yin,&yarray);
    VecCUDARestoreArrayWrite(win,&warray);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  return 0;
}
PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscInt          n = xin->map->n,j;
  PetscScalar       *xarray;
  const PetscScalar *yarray;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;

  PetscLogGpuFlops(nv*2.0*n);
  PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n,&bn);
  VecCUDAGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  for (j=0; j<nv; j++) {
    VecCUDAGetArrayRead(y[j],&yarray);
    cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);
    VecCUDARestoreArrayRead(y[j],&yarray);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArray(xin,&xarray);
  return 0;
}
PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecCUDAGetArrayRead(xin,&xarray);
  VecCUDAGetArrayRead(yin,&yarray);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscLogGpuTimeBegin();
  cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpuScalar(sizeof(PetscScalar));
  VecCUDARestoreArrayRead(xin,&xarray);
  VecCUDARestoreArrayRead(yin,&yarray);
  return 0;
}
//
// CUDA kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
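
// Worked sizes for the kernels below: each block reduces its partial sums in a
// shared-memory buffer of M*MDOT_WORKGROUP_SIZE scalars; for the M = 8 kernel
// with an 8-byte real PetscScalar that is 8*128*8 = 8192 bytes of shared memory
// per block, well within the 48 KB static limit. The scratchpad of per-block
// results is laid out column-wise: dot product j of a launch occupies
// group_results[j*gridDim.x .. (j+1)*gridDim.x - 1], i.e. MDOT_WORKGROUP_NUM = 128
// entries that the host sums up afterwards.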
#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}
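
// In the reduction loop above, each pass halves the number of active threads:
// with blockDim.x = 128 the strides are 64, 32, 16, 8, 4, 2, 1, so after
// log2(128) = 7 passes tmp_buffer[0] (and tmp_buffer[k*MDOT_WORKGROUP_SIZE])
// holds the block's partial dot products. The same pattern is repeated in the
// M = 3, 4, and 8 kernels below.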
// M = 3:
__global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */
PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscInt          i,n = xin->map->n,current_y_index = 0;
  const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
  PetscScalar       *group_results_gpu,*group_results_cpu;
#endif
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);

  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    return 0;
  }

#if !defined(PETSC_USE_COMPLEX)
  // allocate scratchpad memory for the results of individual work groups:
  cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
  PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
#endif
  VecCUDAGetArrayRead(xin,&xptr);
  PetscLogGpuTimeBegin();

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

      case 7:
      case 6:
      case 5:
      case 4:
        VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
#if defined(PETSC_USE_COMPLEX)
        cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
#else
        VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
        current_y_index += 4;
        break;

      case 3:
        VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

#if defined(PETSC_USE_COMPLEX)
        cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
#else
        VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
        current_y_index += 3;
        break;

      case 2:
        VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
        VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
#if defined(PETSC_USE_COMPLEX)
        cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
#else
        VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
        current_y_index += 2;
        break;

      case 1:
        VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
        cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
        current_y_index += 1;
        break;

      default: // 8 or more vectors left
        VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
        VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
        VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
        VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
        VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
#if defined(PETSC_USE_COMPLEX)
        cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
        cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);
        cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);
        cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);
        cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);
#else
        VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
        VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
        current_y_index += 8;
        break;
    }
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(xin,&xptr);

#if defined(PETSC_USE_COMPLEX)
  PetscLogGpuToCpuScalar(nv*sizeof(PetscScalar));
#else
  // copy results to CPU
  cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);

  // sum group results into z
  for (j=0; j<nv1; ++j) {
    z[j] = 0;
    for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
  cudaFree(group_results_gpu);
  PetscFree(group_results_cpu);
  PetscLogGpuToCpuScalar(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
#endif
  PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
  return 0;
}
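
/* A minimal usage sketch for the routine above (error checking omitted): with
   CUDA vectors x and y[0..2], a single call computes all three inner products
   z[j] = (x, y[j]), using the fused kernels above instead of three separate
   cublasXdot() reductions:

     PetscScalar z[3];
     VecMDot(x,3,y,z);
*/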
#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                     *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;

  VecCUDAGetArrayWrite(xin,&xarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    cudaMemset(xarray,0,n*sizeof(PetscScalar));
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr,xptr+n,alpha);
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayWrite(xin,&xarray);
  return 0;
}
struct PetscScalarReciprocal
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& s)
  {
    return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
  }
};

PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
{
  PetscInt    n;
  PetscScalar *x;

  VecGetLocalSize(v,&n);
  VecCUDAGetArray(v,&x);
  PetscLogGpuTimeBegin();
  try {
    auto xptr = thrust::device_pointer_cast(x);
    thrust::transform(VecCUDAThrustPolicy(v),xptr,xptr+n,xptr,PetscScalarReciprocal());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArray(v,&x);
  return 0;
}
PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
{
  PetscScalar    *xarray;
  PetscBLASInt   one = 1,bn = 0;
  cublasHandle_t cublasv2handle;

  if (alpha == (PetscScalar)0.0) {
    VecSet_SeqCUDA(xin,alpha);
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscBLASIntCast(xin->map->n,&bn);
    VecCUDAGetArray(xin,&xarray);
    PetscLogGpuTimeBegin();
    cublasXscal(cublasv2handle,bn,&alpha,xarray,one);
    VecCUDARestoreArray(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    PetscLogGpuFlops(xin->map->n);
  }
  return 0;
}
PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  VecCUDAGetArrayRead(xin,&xarray);
  VecCUDAGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpuScalar(sizeof(PetscScalar));
  VecCUDARestoreArrayRead(yin,&yarray);
  VecCUDARestoreArrayRead(xin,&xarray);
  return 0;
}
PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;

  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiscuda;

      PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
      VecCUDAGetArrayRead(xin,&xarray);
      if (yiscuda) {
        VecCUDAGetArrayWrite(yin,&yarray);
      } else {
        VecGetArrayWrite(yin,&yarray);
      }
      PetscLogGpuTimeBegin();
      if (yiscuda) {
        cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
      } else {
        cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
        PetscLogGpuToCpu((yin->map->n)*sizeof(PetscScalar));
      }
      PetscLogGpuTimeEnd();
      VecCUDARestoreArrayRead(xin,&xarray);
      if (yiscuda) {
        VecCUDARestoreArrayWrite(yin,&yarray);
      } else {
        VecRestoreArrayWrite(yin,&yarray);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy on the CPU if we are on the CPU */
      VecCopy_SeqCUDA_Private(xin,yin);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy on the CPU */
        VecCopy_SeqCUDA_Private(xin,yin);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy on the GPU */
        VecCUDAGetArrayRead(xin,&xarray);
        VecCUDAGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
        PetscLogGpuTimeEnd();
        VecCUDARestoreArrayRead(xin,&xarray);
        VecCUDARestoreArrayWrite(yin,&yarray);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copy on the GPU (this is an arbitrary choice) */
        VecCUDAGetArrayRead(xin,&xarray);
        VecCUDAGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
        PetscLogGpuTimeEnd();
        VecCUDARestoreArrayRead(xin,&xarray);
        VecCUDARestoreArrayWrite(yin,&yarray);
      } else {
        VecCopy_SeqCUDA_Private(xin,yin);
      }
    }
  }
  return 0;
}
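
/* Summary of the copy-path selection above: the source's offload mask decides
   where the data is valid, and when both copies are valid the destination's
   mask breaks the tie, so the copy lands where the result is most likely to be
   used next:

     xin mask   yin mask    copy performed
     GPU        any         device-to-device (device-to-host if yin is not a CUDA vector)
     CPU        any         host-to-host via VecCopy_SeqCUDA_Private()
     BOTH       CPU         host-to-host
     BOTH       GPU/BOTH    device-to-device
     BOTH       other       host-to-host
*/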
PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
{
  PetscBLASInt   one = 1,bn = 0;
  PetscScalar    *xarray,*yarray;
  cublasHandle_t cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  if (xin != yin) {
    VecCUDAGetArray(xin,&xarray);
    VecCUDAGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecCUDARestoreArray(xin,&xarray);
    VecCUDARestoreArray(yin,&yarray);
  }
  return 0;
}
PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscScalar       a = alpha,b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1, bn = 0;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  if (a == (PetscScalar)0.0) {
    VecScale_SeqCUDA(yin,beta);
  } else if (b == (PetscScalar)1.0) {
    VecAXPY_SeqCUDA(yin,alpha,xin);
  } else if (a == (PetscScalar)1.0) {
    VecAYPX_SeqCUDA(yin,beta,xin);
  } else if (b == (PetscScalar)0.0) {
    VecCUDAGetArrayRead(xin,&xarray);
    VecCUDAGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
    cublasXscal(cublasv2handle,bn,&alpha,yarray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(xin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    VecCUDARestoreArrayRead(xin,&xarray);
    VecCUDARestoreArray(yin,&yarray);
  } else {
    VecCUDAGetArrayRead(xin,&xarray);
    VecCUDAGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    cublasXscal(cublasv2handle,bn,&beta,yarray,one);
    cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
    VecCUDARestoreArrayRead(xin,&xarray);
    VecCUDARestoreArray(yin,&yarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(3.0*xin->map->n);
    PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
  }
  return 0;
}
PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscInt n = zin->map->n;

  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    VecAXPY_SeqCUDA(zin,alpha,xin);
    VecAXPY_SeqCUDA(zin,beta,yin);
    PetscLogGpuFlops(4.0*n);
  } else {
    /* z = a*x + b*y + c*z */
    VecScale_SeqCUDA(zin,gamma);
    VecAXPY_SeqCUDA(zin,alpha,xin);
    VecAXPY_SeqCUDA(zin,beta,yin);
    PetscLogGpuFlops(5.0*n);
  }
  return 0;
}
PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
{
  PetscInt                              n = win->map->n;
  const PetscScalar                     *xarray,*yarray;
  PetscScalar                           *warray;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseMult_Seq(win,xin,yin);
    return 0;
  }
  VecCUDAGetArrayRead(xin,&xarray);
  VecCUDAGetArrayRead(yin,&yarray);
  VecCUDAGetArrayWrite(win,&warray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
    thrust::transform(VecCUDAThrustPolicy(win),xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(xin,&xarray);
  VecCUDARestoreArrayRead(yin,&yarray);
  VecCUDARestoreArrayWrite(win,&warray);
  PetscLogGpuFlops(n);
  return 0;
}
/* TODO: NORM_INFINITY should be computed with a dedicated CUDA reduction kernel;
   the current implementation uses cublasIXamax() plus a one-element copy back to the host */
PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
{
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = 0;
  const PetscScalar *xarray;
  cublasHandle_t    cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n,&bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecCUDAGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    cublasXnrm2(cublasv2handle,bn,xarray,one,z);
    PetscLogGpuTimeEnd();
    VecCUDARestoreArrayRead(xin,&xarray);
    PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
  } else if (type == NORM_INFINITY) {
    int i;
    VecCUDAGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    cublasIXamax(cublasv2handle,bn,xarray,one,&i);
    PetscLogGpuTimeEnd();
    if (bn) {
      PetscScalar zs;
      cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    VecCUDARestoreArrayRead(xin,&xarray);
  } else if (type == NORM_1) {
    VecCUDAGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    cublasXasum(cublasv2handle,bn,xarray,one,z);
    VecCUDARestoreArrayRead(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(PetscMax(n-1.0,0.0));
  } else if (type == NORM_1_AND_2) {
    VecNorm_SeqCUDA(xin,NORM_1,z);
    VecNorm_SeqCUDA(xin,NORM_2,z+1);
  }
  PetscLogGpuToCpuScalar(sizeof(PetscReal));
  return 0;
}
PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  VecDot_SeqCUDA(s,t,dp);
  VecDot_SeqCUDA(t,t,nm);
  return 0;
}
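
/* VecDotNorm2() returns dp = (s,t) and nm = (t,t) in one call so that solvers
   such as BiCGStab can obtain both reductions together; here it simply forwards
   to two VecDot_SeqCUDA() calls, i.e. two cublasXdot() reductions on the GPU. */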
PetscErrorCode VecDestroy_SeqCUDA(Vec v)
{
  Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;

  if (v->spptr) {
    if (veccuda->GPUarray_allocated) {
#if defined(PETSC_HAVE_NVSHMEM)
      if (veccuda->nvshmem) {
        PetscNvshmemFree(veccuda->GPUarray_allocated);
        veccuda->nvshmem = PETSC_FALSE;
      }
      else
#endif
      cudaFree(veccuda->GPUarray_allocated);
      veccuda->GPUarray_allocated = NULL;
    }
    if (veccuda->stream) {
      cudaStreamDestroy(veccuda->stream);
    }
  }
  VecDestroy_SeqCUDA_Private(v);
  PetscFree(v->spptr);
  return 0;
}
#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& x)
  {
    return PetscConj(x);
  }
};
#endif

PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                     *xarray;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;

  VecCUDAGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(VecCUDAThrustPolicy(xin),xptr,xptr+n,xptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArray(xin,&xarray);
#endif
  return 0;
}
static inline PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
{
  PetscBool wisseqcuda;

  PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
  if (w->data && wisseqcuda) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      if (w->pinned_memory) {
        PetscMallocSetCUDAHost();
      }
      PetscFree(((Vec_Seq*)w->data)->array_allocated);
      if (w->pinned_memory) {
        PetscMallocResetCUDAHost();
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq*)w->data)->array         = NULL;
    ((Vec_Seq*)w->data)->unplacedarray = NULL;
  }
  if (w->spptr && wisseqcuda) {
    if (((Vec_CUDA*)w->spptr)->GPUarray) {
      cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);
      ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_CUDA*)w->spptr)->stream) {
      cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);
    }
    PetscFree(w->spptr);
  }

  if (v->petscnative && wisseqcuda) {
    PetscFree(w->data);
    w->data          = v->data;
    w->offloadmask   = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr         = v->spptr;
    PetscObjectStateIncrease((PetscObject)w);
  } else {
    if (read) {
      VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecGetArray(v,&((Vec_Seq*)w->data)->array);
    }
    w->offloadmask = PETSC_OFFLOAD_CPU;
    if (wisseqcuda) {
      VecCUDAAllocateCheck(w);
    }
  }
  return 0;
}
static inline PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
{
  PetscBool wisseqcuda;

  PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
  if (v->petscnative && wisseqcuda) {
    v->data          = w->data;
    v->offloadmask   = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr         = w->spptr;
    w->data          = 0;
    w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr         = 0;
  } else {
    if (read) {
      VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
    }
    if (w->spptr && wisseqcuda) {
      cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);
      ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
      if (((Vec_CUDA*)w->spptr)->stream) {
        cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);
      }
      PetscFree(w->spptr);
    }
  }
  return 0;
}
PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
  return 0;
}

PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
  return 0;
}
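
/* A minimal usage sketch for the local-vector routines above (error checking
   omitted): wrap the local part of a (possibly MPI) CUDA vector x in a
   sequential CUDA vector xlocal, without a copy when the representations match:

     Vec      xlocal;
     PetscInt nlocal;
     VecGetLocalSize(x,&nlocal);
     VecCreateSeqCUDA(PETSC_COMM_SELF,nlocal,&xlocal);
     VecGetLocalVectorRead(x,xlocal);
     ...                                  // use xlocal as an ordinary VECSEQCUDA vector
     VecRestoreLocalVectorRead(x,xlocal);
     VecDestroy(&xlocal);
*/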
struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscScalar& x) {
    return PetscRealPart(x);
  }
};

struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
    return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
  }
};

struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? y : x;
  }
};

struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};

struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? x : y;
  }
};

struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};
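
/* The functors above implement the reductions used by VecMax_SeqCUDA() and
   VecMin_SeqCUDA() below: petscmax/petscmin reduce plain values, while
   petscmaxi/petscmini reduce (value,index) tuples, preferring the better value
   and breaking ties by the smaller index; petscrealpart/petscrealparti strip
   imaginary parts first in complex builds, matching the convention that
   VecMax/VecMin compare real parts. */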
PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MIN_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecCUDAGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,petscrealparti(),res,petscmaxi());
#else
      res = thrust::reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,res,petscmaxi());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
#else
      *m = thrust::reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,PETSC_MIN_REAL,petscmax());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(v,&av);
  return 0;
}
PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MAX_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecCUDAGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,petscrealparti(),res,petscmini());
#else
      res = thrust::reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,res,petscmini());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
#else
      *m = thrust::reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,PETSC_MAX_REAL,petscmin());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(v,&av);
  return 0;
}
PetscErrorCode VecSum_SeqCUDA(Vec v,PetscScalar *sum)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *a;
  thrust::device_ptr<const PetscScalar> dptr;

  VecCUDAGetArrayRead(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    *sum = thrust::reduce(VecCUDAThrustPolicy(v),dptr,dptr+n,PetscScalar(0.0));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(v,&a);
  return 0;
}
struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
{
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift) {}
  __host__ __device__
  PetscScalar operator()(PetscScalar x) { return x + shift_; }
};

PetscErrorCode VecShift_SeqCUDA(Vec v,PetscScalar shift)
{
  PetscInt                        n = v->map->n;
  PetscScalar                     *a;
  thrust::device_ptr<PetscScalar> dptr;

  VecCUDAGetArray(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    thrust::transform(VecCUDAThrustPolicy(v),dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecCUDARestoreArray(v,&a);
  return 0;
}
#if defined(PETSC_HAVE_NVSHMEM)
/* Free the old CUDA array and re-allocate a new one from the nvshmem symmetric heap.
   The new array does not retain the values of the old array. The offload mask is not changed.

   Note: this function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
*/
PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
{
  Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
  PetscInt n;

  cudaFree(veccuda->GPUarray_allocated);
  VecGetLocalSize(v,&n);
  MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
  PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
  veccuda->GPUarray = veccuda->GPUarray_allocated;
  veccuda->nvshmem  = PETSC_TRUE;
  return 0;
}
#endif