Actual source code: veccuda2.cu
petsc-3.9.4 2018-09-11
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/transform.h>
15: #include <thrust/functional.h>
17: /*
18: Allocates space for the vector array on the GPU if it does not exist.
19: Does NOT change the PetscCUDAFlag for the vector
20: Does NOT zero the CUDA array
22: */
23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
24: {
26: cudaError_t err;
27: cudaStream_t stream;
28: Vec_CUDA *veccuda;
31: if (!v->spptr) {
32: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
33: veccuda = (Vec_CUDA*)v->spptr;
34: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
35: veccuda->GPUarray = veccuda->GPUarray_allocated;
36: err = cudaStreamCreate(&stream);CHKERRCUDA(err);
37: veccuda->stream = stream;
38: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
39: if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->valid_GPU_array = PETSC_OFFLOAD_CPU;
42: } else {
43: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
44: }
45: }
46: }
47: return(0);
48: }
50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
51: PetscErrorCode VecCUDACopyToGPU(Vec v)
52: {
54: cudaError_t err;
55: Vec_CUDA *veccuda;
56: PetscScalar *varray;
60: VecCUDAAllocateCheck(v);
61: if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
62: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
63: veccuda=(Vec_CUDA*)v->spptr;
64: varray=veccuda->GPUarray;
65: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
66: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
67: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
68: }
69: return(0);
70: }
72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
73: {
74: PetscScalar *varray;
76: cudaError_t err;
77: PetscScalar *cpuPtr, *gpuPtr;
78: Vec_Seq *s;
79: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
83: VecCUDAAllocateCheck(v);
84: if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
85: s = (Vec_Seq*)v->data;
87: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
88: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
89: gpuPtr = varray + ptop_scatter->recvLowestIndex;
90: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
92: /* Note : this code copies the smallest contiguous chunk of data
93: containing ALL of the indices */
94: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
96: // Set the buffer states
97: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
98: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
99: }
100: return(0);
101: }
104: /*
105: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
106: */
107: PetscErrorCode VecCUDACopyFromGPU(Vec v)
108: {
110: cudaError_t err;
111: Vec_CUDA *veccuda;
112: PetscScalar *varray;
116: VecCUDAAllocateCheckHost(v);
117: if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
118: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
119: veccuda=(Vec_CUDA*)v->spptr;
120: varray=veccuda->GPUarray;
121: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
122: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
123: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
124: }
125: return(0);
126: }
128: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
129: which means that we need to recombine the data at some point before using any of the standard functions.
130: We could add another few flag types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
131: which always have to be called in pairs.
132: */
133: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
134: {
135: const PetscScalar *varray, *gpuPtr;
136: PetscErrorCode ierr;
137: cudaError_t err;
138: PetscScalar *cpuPtr;
139: Vec_Seq *s;
140: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
144: VecCUDAAllocateCheckHost(v);
145: if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
146: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
148: varray=((Vec_CUDA*)v->spptr)->GPUarray;
149: s = (Vec_Seq*)v->data;
150: gpuPtr = varray + ptop_scatter->sendLowestIndex;
151: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
153: /* Note : this code copies the smallest contiguous chunk of data
154: containing ALL of the indices */
155: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
157: VecCUDARestoreArrayRead(v,&varray);
158: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
159: }
160: return(0);
161: }
163: /*MC
164: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
166: Options Database Keys:
167: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
169: Level: beginner
171: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
172: M*/
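/* A minimal usage sketch (not part of this source file): create a sequential CUDA vector either
   programmatically or through the options database; all calls below are standard PETSc Vec API.

     Vec            v;
     PetscErrorCode ierr;
     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);   // or VecSetFromOptions() plus -vec_type seqcuda
     ierr = VecSet(v,1.0);CHKERRQ(ierr);              // executed on the GPU by VecSet_SeqCUDA()
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/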
174: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
175: {
176: const PetscScalar *xarray;
177: PetscScalar *yarray;
178: PetscErrorCode ierr;
179: PetscBLASInt one=1,bn;
180: PetscScalar sone=1.0;
181: cublasHandle_t cublasv2handle;
182: cublasStatus_t cberr;
183: cudaError_t err;
186: PetscCUBLASGetHandle(&cublasv2handle);
187: PetscBLASIntCast(yin->map->n,&bn);
188: VecCUDAGetArrayRead(xin,&xarray);
189: VecCUDAGetArrayReadWrite(yin,&yarray);
190: if (alpha == (PetscScalar)0.0) {
191: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
192: } else if (alpha == (PetscScalar)1.0) {
193: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
194: PetscLogFlops(2.0*yin->map->n);
195: } else {
196: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
197: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
198: PetscLogFlops(2.0*yin->map->n);
199: }
200: WaitForGPU();CHKERRCUDA(ierr);
201: VecCUDARestoreArrayRead(xin,&xarray);
202: VecCUDARestoreArrayReadWrite(yin,&yarray);
203: return(0);
204: }
206: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
207: {
208: const PetscScalar *xarray;
209: PetscScalar *yarray;
210: PetscErrorCode ierr;
211: PetscBLASInt one=1,bn;
212: cublasHandle_t cublasv2handle;
213: cublasStatus_t cberr;
216: PetscCUBLASGetHandle(&cublasv2handle);
217: if (alpha != (PetscScalar)0.0) {
218: PetscBLASIntCast(yin->map->n,&bn);
219: VecCUDAGetArrayRead(xin,&xarray);
220: VecCUDAGetArrayReadWrite(yin,&yarray);
221: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
222: WaitForGPU();CHKERRCUDA(ierr);
223: VecCUDARestoreArrayRead(xin,&xarray);
224: VecCUDARestoreArrayReadWrite(yin,&yarray);
225: PetscLogFlops(2.0*yin->map->n);
226: }
227: return(0);
228: }
230: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
231: {
232: PetscInt n = xin->map->n;
233: const PetscScalar *xarray=NULL,*yarray=NULL;
234: PetscScalar *warray=NULL;
235: thrust::device_ptr<const PetscScalar> xptr,yptr;
236: thrust::device_ptr<PetscScalar> wptr;
237: PetscErrorCode ierr;
240: VecCUDAGetArrayWrite(win,&warray);
241: VecCUDAGetArrayRead(xin,&xarray);
242: VecCUDAGetArrayRead(yin,&yarray);
243: try {
244: wptr = thrust::device_pointer_cast(warray);
245: xptr = thrust::device_pointer_cast(xarray);
246: yptr = thrust::device_pointer_cast(yarray);
247: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
248: WaitForGPU();CHKERRCUDA(ierr);
249: } catch (char *ex) {
250: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
251: }
252: PetscLogFlops(n);
253: VecCUDARestoreArrayRead(xin,&xarray);
254: VecCUDARestoreArrayRead(yin,&yarray);
255: VecCUDARestoreArrayWrite(win,&warray);
256: return(0);
257: }
259: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
260: {
261: const PetscScalar *xarray=NULL,*yarray=NULL;
262: PetscScalar *warray=NULL;
263: PetscErrorCode ierr;
264: PetscBLASInt one=1,bn;
265: cublasHandle_t cublasv2handle;
266: cublasStatus_t cberr;
267: cudaError_t err;
270: PetscCUBLASGetHandle(&cublasv2handle);
271: PetscBLASIntCast(win->map->n,&bn);
272: if (alpha == (PetscScalar)0.0) {
273: VecCopy_SeqCUDA(yin,win);
274: } else {
275: VecCUDAGetArrayRead(xin,&xarray);
276: VecCUDAGetArrayRead(yin,&yarray);
277: VecCUDAGetArrayWrite(win,&warray);
278: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
279: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
280: PetscLogFlops(2*win->map->n);
281: WaitForGPU();CHKERRCUDA(ierr);
282: VecCUDARestoreArrayRead(xin,&xarray);
283: VecCUDARestoreArrayRead(yin,&yarray);
284: VecCUDARestoreArrayWrite(win,&warray);
285: }
286: return(0);
287: }
289: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
290: {
292: PetscInt n = xin->map->n,j,j_rem;
293: PetscScalar alpha0,alpha1,alpha2,alpha3;
296: PetscLogFlops(nv*2.0*n);
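/* Peel off nv mod 4 vectors first; the loop below then processes the remaining vectors four at a time */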
297: switch (j_rem=nv&0x3) {
298: case 3:
299: alpha0 = alpha[0];
300: alpha1 = alpha[1];
301: alpha2 = alpha[2];
302: alpha += 3;
303: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
304: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
305: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
306: y += 3;
307: break;
308: case 2:
309: alpha0 = alpha[0];
310: alpha1 = alpha[1];
311: alpha +=2;
312: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
313: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
314: y +=2;
315: break;
316: case 1:
317: alpha0 = *alpha++;
318: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
319: y +=1;
320: break;
321: }
322: for (j=j_rem; j<nv; j+=4) {
323: alpha0 = alpha[0];
324: alpha1 = alpha[1];
325: alpha2 = alpha[2];
326: alpha3 = alpha[3];
327: alpha += 4;
328: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
329: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
330: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
331: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
332: y += 4;
333: }
334: WaitForGPU();CHKERRCUDA(ierr);
335: return(0);
336: }
338: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
339: {
340: const PetscScalar *xarray,*yarray;
341: PetscErrorCode ierr;
342: PetscBLASInt one=1,bn;
343: cublasHandle_t cublasv2handle;
344: cublasStatus_t cberr;
347: PetscCUBLASGetHandle(&cublasv2handle);
348: PetscBLASIntCast(yin->map->n,&bn);
349: VecCUDAGetArrayRead(xin,&xarray);
350: VecCUDAGetArrayRead(yin,&yarray);
351: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
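/* i.e. VecDot(x,y) = y^H x = sum_i x_i*conj(y_i), which is what cublasXdot() returns when yarray is passed as the first (conjugated) argument */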
352: cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
353: WaitForGPU();CHKERRCUDA(ierr);
354: if (xin->map->n >0) {
355: PetscLogFlops(2.0*xin->map->n-1);
356: }
357: VecCUDARestoreArrayRead(xin,&xarray);
358: VecCUDARestoreArrayRead(yin,&yarray);
359: return(0);
360: }
362: //
363: // CUDA kernels for MDot to follow
364: //
366: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
367: #define MDOT_WORKGROUP_SIZE 128
368: #define MDOT_WORKGROUP_NUM 128
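// The kernels below use a two-stage reduction: each of the MDOT_WORKGROUP_NUM thread blocks accumulates
// partial dot products for its slice of the vectors in shared memory, and the host code in
// VecMDot_SeqCUDA() then sums the per-block results for each y vector.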
370: #if !defined(PETSC_USE_COMPLEX)
371: // M = 2:
372: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
373: PetscInt size, PetscScalar *group_results)
374: {
375: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
376: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
377: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
378: PetscInt vec_start_index = blockIdx.x * entries_per_group;
379: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
381: PetscScalar entry_x = 0;
382: PetscScalar group_sum0 = 0;
383: PetscScalar group_sum1 = 0;
384: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
385: entry_x = x[i]; // load only once from global memory!
386: group_sum0 += entry_x * y0[i];
387: group_sum1 += entry_x * y1[i];
388: }
389: tmp_buffer[threadIdx.x] = group_sum0;
390: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
392: // parallel reduction
393: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
394: __syncthreads();
395: if (threadIdx.x < stride) {
396: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
397: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
398: }
399: }
401: // write result of group to group_results
402: if (threadIdx.x == 0) {
403: group_results[blockIdx.x] = tmp_buffer[0];
404: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
405: }
406: }
408: // M = 3:
409: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
410: PetscInt size, PetscScalar *group_results)
411: {
412: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
413: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
414: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
415: PetscInt vec_start_index = blockIdx.x * entries_per_group;
416: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
418: PetscScalar entry_x = 0;
419: PetscScalar group_sum0 = 0;
420: PetscScalar group_sum1 = 0;
421: PetscScalar group_sum2 = 0;
422: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
423: entry_x = x[i]; // load only once from global memory!
424: group_sum0 += entry_x * y0[i];
425: group_sum1 += entry_x * y1[i];
426: group_sum2 += entry_x * y2[i];
427: }
428: tmp_buffer[threadIdx.x] = group_sum0;
429: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
430: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
432: // parallel reduction
433: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
434: __syncthreads();
435: if (threadIdx.x < stride) {
436: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
437: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
438: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
439: }
440: }
442: // write result of group to group_results
443: if (threadIdx.x == 0) {
444: group_results[blockIdx.x ] = tmp_buffer[0];
445: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
446: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
447: }
448: }
450: // M = 4:
451: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
452: PetscInt size, PetscScalar *group_results)
453: {
454: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
455: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
456: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
457: PetscInt vec_start_index = blockIdx.x * entries_per_group;
458: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
460: PetscScalar entry_x = 0;
461: PetscScalar group_sum0 = 0;
462: PetscScalar group_sum1 = 0;
463: PetscScalar group_sum2 = 0;
464: PetscScalar group_sum3 = 0;
465: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
466: entry_x = x[i]; // load only once from global memory!
467: group_sum0 += entry_x * y0[i];
468: group_sum1 += entry_x * y1[i];
469: group_sum2 += entry_x * y2[i];
470: group_sum3 += entry_x * y3[i];
471: }
472: tmp_buffer[threadIdx.x] = group_sum0;
473: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
474: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
475: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
477: // parallel reduction
478: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479: __syncthreads();
480: if (threadIdx.x < stride) {
481: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
482: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
483: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
485: }
486: }
488: // write result of group to group_results
489: if (threadIdx.x == 0) {
490: group_results[blockIdx.x ] = tmp_buffer[0];
491: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
492: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
493: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
494: }
495: }
497: // M = 8:
498: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
499: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
500: PetscInt size, PetscScalar *group_results)
501: {
502: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
503: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
504: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
505: PetscInt vec_start_index = blockIdx.x * entries_per_group;
506: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
508: PetscScalar entry_x = 0;
509: PetscScalar group_sum0 = 0;
510: PetscScalar group_sum1 = 0;
511: PetscScalar group_sum2 = 0;
512: PetscScalar group_sum3 = 0;
513: PetscScalar group_sum4 = 0;
514: PetscScalar group_sum5 = 0;
515: PetscScalar group_sum6 = 0;
516: PetscScalar group_sum7 = 0;
517: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
518: entry_x = x[i]; // load only once from global memory!
519: group_sum0 += entry_x * y0[i];
520: group_sum1 += entry_x * y1[i];
521: group_sum2 += entry_x * y2[i];
522: group_sum3 += entry_x * y3[i];
523: group_sum4 += entry_x * y4[i];
524: group_sum5 += entry_x * y5[i];
525: group_sum6 += entry_x * y6[i];
526: group_sum7 += entry_x * y7[i];
527: }
528: tmp_buffer[threadIdx.x] = group_sum0;
529: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
530: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
531: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
532: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
533: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
534: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
535: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
537: // parallel reduction
538: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
539: __syncthreads();
540: if (threadIdx.x < stride) {
541: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
542: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
543: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
544: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
545: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
546: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
547: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
548: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
549: }
550: }
552: // write result of group to group_results
553: if (threadIdx.x == 0) {
554: group_results[blockIdx.x ] = tmp_buffer[0];
555: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
556: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
557: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
558: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
559: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
560: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
561: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
562: }
563: }
564: #endif /* !defined(PETSC_USE_COMPLEX) */
566: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
567: {
568: PetscErrorCode ierr;
569: PetscInt i,n = xin->map->n,current_y_index = 0;
570: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
571: PetscScalar *group_results_gpu;
572: #if !defined(PETSC_USE_COMPLEX)
573: PetscInt j;
574: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
575: #endif
576: cudaError_t cuda_ierr;
577: PetscBLASInt one=1,bn;
578: cublasHandle_t cublasv2handle;
579: cublasStatus_t cberr;
582: PetscCUBLASGetHandle(&cublasv2handle);
583: PetscBLASIntCast(xin->map->n,&bn);
584: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
585: /* Handle the case of local size zero first */
586: if (!xin->map->n) {
587: for (i=0; i<nv; ++i) z[i] = 0;
588: return(0);
589: }
591: // allocate scratchpad memory for the results of individual work groups:
592: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
594: VecCUDAGetArrayRead(xin,&xptr);
596: while (current_y_index < nv)
597: {
598: switch (nv - current_y_index) {
600: case 7:
601: case 6:
602: case 5:
603: case 4:
604: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
605: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
606: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
607: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
609: #if defined(PETSC_USE_COMPLEX)
610: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
611: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
612: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
613: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
614: #else
615: // run kernel:
616: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
618: // copy results back to the CPU:
619: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
621: // sum group results into z:
622: for (j=0; j<4; ++j) {
623: z[current_y_index + j] = 0;
624: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
625: }
626: #endif
627: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
628: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
629: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
630: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
631: current_y_index += 4;
632: break;
634: case 3:
635: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
636: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
637: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
639: #if defined(PETSC_USE_COMPLEX)
640: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
641: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
642: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
643: #else
644: // run kernel:
645: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
647: // copy results back to the CPU:
648: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
650: // sum group results into z:
651: for (j=0; j<3; ++j) {
652: z[current_y_index + j] = 0;
653: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
654: }
655: #endif
657: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
658: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
659: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
660: current_y_index += 3;
661: break;
663: case 2:
664: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
665: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
667: #if defined(PETSC_USE_COMPLEX)
668: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
669: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
670: #else
671: // run kernel:
672: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
674: // copy results back to the CPU:
675: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
677: // sum group results into z:
678: for (j=0; j<2; ++j) {
679: z[current_y_index + j] = 0;
680: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
681: }
682: #endif
683: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
684: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
685: current_y_index += 2;
686: break;
688: case 1:
689: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
690: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
691: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
692: current_y_index += 1;
693: break;
695: default: // 8 or more vectors left
696: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
697: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
698: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
699: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
700: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
701: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
702: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
703: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
705: #if defined(PETSC_USE_COMPLEX)
706: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
707: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
708: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
709: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
710: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
711: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
712: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
713: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
714: #else
715: // run kernel:
716: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
718: // copy results back to the CPU:
719: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
721: // sum group results into z:
722: for (j=0; j<8; ++j) {
723: z[current_y_index + j] = 0;
724: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
725: }
726: #endif
727: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
728: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
729: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
730: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
731: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
732: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
733: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
734: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
735: current_y_index += 8;
736: break;
737: }
738: }
739: VecCUDARestoreArrayRead(xin,&xptr);
741: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
742: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
743: return(0);
744: }
746: #undef MDOT_WORKGROUP_SIZE
747: #undef MDOT_WORKGROUP_NUM
749: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
750: {
751: PetscInt n = xin->map->n;
752: PetscScalar *xarray=NULL;
753: thrust::device_ptr<PetscScalar> xptr;
754: PetscErrorCode ierr;
755: cudaError_t err;
758: VecCUDAGetArrayWrite(xin,&xarray);
759: if (alpha == (PetscScalar)0.0) {
760: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
761: } else {
762: try {
763: xptr = thrust::device_pointer_cast(xarray);
764: thrust::fill(xptr,xptr+n,alpha);
765: } catch (char *ex) {
766: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
767: }
768: }
769: WaitForGPU();CHKERRCUDA(ierr);
770: VecCUDARestoreArrayWrite(xin,&xarray);
771: return(0);
772: }
774: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
775: {
776: PetscScalar *xarray;
778: PetscBLASInt one=1,bn;
779: cublasHandle_t cublasv2handle;
780: cublasStatus_t cberr;
783: if (alpha == (PetscScalar)0.0) {
784: VecSet_SeqCUDA(xin,alpha);
785: } else if (alpha != (PetscScalar)1.0) {
786: PetscCUBLASGetHandle(&cublasv2handle);
787: PetscBLASIntCast(xin->map->n,&bn);
788: VecCUDAGetArrayReadWrite(xin,&xarray);
789: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
790: VecCUDARestoreArrayReadWrite(xin,&xarray);
791: }
792: WaitForGPU();CHKERRCUDA(ierr);
793: PetscLogFlops(xin->map->n);
794: return(0);
795: }
797: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
798: {
799: const PetscScalar *xarray,*yarray;
800: PetscErrorCode ierr;
801: PetscBLASInt one=1,bn;
802: cublasHandle_t cublasv2handle;
803: cublasStatus_t cberr;
806: PetscCUBLASGetHandle(&cublasv2handle);
807: PetscBLASIntCast(xin->map->n,&bn);
808: VecCUDAGetArrayRead(xin,&xarray);
809: VecCUDAGetArrayRead(yin,&yarray);
810: cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
811: WaitForGPU();CHKERRCUDA(ierr);
812: if (xin->map->n > 0) {
813: PetscLogFlops(2.0*xin->map->n-1);
814: }
815: VecCUDARestoreArrayRead(yin,&yarray);
816: VecCUDARestoreArrayRead(xin,&xarray);
817: return(0);
818: }
820: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
821: {
822: const PetscScalar *xarray;
823: PetscScalar *yarray;
824: PetscErrorCode ierr;
825: cudaError_t err;
828: if (xin != yin) {
829: if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
830: VecCUDAGetArrayRead(xin,&xarray);
831: VecCUDAGetArrayWrite(yin,&yarray);
832: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
833: WaitForGPU();CHKERRCUDA(ierr);
834: VecCUDARestoreArrayRead(xin,&xarray);
835: VecCUDARestoreArrayWrite(yin,&yarray);
837: } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
838: /* copy in CPU if we are on the CPU*/
839: VecCopy_SeqCUDA_Private(xin,yin);
840: } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
841: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
842: if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
843: /* copy in CPU */
844: VecCopy_SeqCUDA_Private(xin,yin);
846: } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
847: /* copy in GPU */
848: VecCUDAGetArrayRead(xin,&xarray);
849: VecCUDAGetArrayWrite(yin,&yarray);
850: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
851: VecCUDARestoreArrayRead(xin,&xarray);
852: VecCUDARestoreArrayWrite(yin,&yarray);
853: } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
854: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
855: default to copying on the GPU (this is an arbitrary choice) */
856: VecCUDAGetArrayRead(xin,&xarray);
857: VecCUDAGetArrayWrite(yin,&yarray);
858: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
859: VecCUDARestoreArrayRead(xin,&xarray);
860: VecCUDARestoreArrayWrite(yin,&yarray);
861: } else {
862: VecCopy_SeqCUDA_Private(xin,yin);
863: }
864: }
865: }
866: return(0);
867: }
869: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
870: {
872: PetscBLASInt one = 1,bn;
873: PetscScalar *xarray,*yarray;
874: cublasHandle_t cublasv2handle;
875: cublasStatus_t cberr;
878: PetscCUBLASGetHandle(&cublasv2handle);
879: PetscBLASIntCast(xin->map->n,&bn);
880: if (xin != yin) {
881: VecCUDAGetArrayReadWrite(xin,&xarray);
882: VecCUDAGetArrayReadWrite(yin,&yarray);
883: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
884: WaitForGPU();CHKERRCUDA(ierr);
885: VecCUDARestoreArrayReadWrite(xin,&xarray);
886: VecCUDARestoreArrayReadWrite(yin,&yarray);
887: }
888: return(0);
889: }
891: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
892: {
893: PetscErrorCode ierr;
894: PetscScalar a = alpha,b = beta;
895: const PetscScalar *xarray;
896: PetscScalar *yarray;
897: PetscBLASInt one = 1, bn;
898: cublasHandle_t cublasv2handle;
899: cublasStatus_t cberr;
900: cudaError_t err;
903: PetscCUBLASGetHandle(&cublasv2handle);
904: PetscBLASIntCast(yin->map->n,&bn);
905: if (a == (PetscScalar)0.0) {
906: VecScale_SeqCUDA(yin,beta);
907: } else if (b == (PetscScalar)1.0) {
908: VecAXPY_SeqCUDA(yin,alpha,xin);
909: } else if (a == (PetscScalar)1.0) {
910: VecAYPX_SeqCUDA(yin,beta,xin);
911: } else if (b == (PetscScalar)0.0) {
912: VecCUDAGetArrayRead(xin,&xarray);
913: VecCUDAGetArrayReadWrite(yin,&yarray);
914: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
915: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
916: PetscLogFlops(xin->map->n);
917: WaitForGPU();CHKERRCUDA(ierr);
918: VecCUDARestoreArrayRead(xin,&xarray);
919: VecCUDARestoreArrayReadWrite(yin,&yarray);
920: } else {
921: VecCUDAGetArrayRead(xin,&xarray);
922: VecCUDAGetArrayReadWrite(yin,&yarray);
923: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
924: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
925: VecCUDARestoreArrayRead(xin,&xarray);
926: VecCUDARestoreArrayReadWrite(yin,&yarray);
927: WaitForGPU();CHKERRCUDA(ierr);
928: PetscLogFlops(3.0*xin->map->n);
929: }
930: return(0);
931: }
933: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
934: {
936: PetscInt n = zin->map->n;
939: if (gamma == (PetscScalar)1.0) {
940: /* z = ax + b*y + z */
941: VecAXPY_SeqCUDA(zin,alpha,xin);
942: VecAXPY_SeqCUDA(zin,beta,yin);
943: PetscLogFlops(4.0*n);
944: } else {
945: /* z = a*x + b*y + c*z */
946: VecScale_SeqCUDA(zin,gamma);
947: VecAXPY_SeqCUDA(zin,alpha,xin);
948: VecAXPY_SeqCUDA(zin,beta,yin);
949: PetscLogFlops(5.0*n);
950: }
951: WaitForGPU();CHKERRCUDA(ierr);
952: return(0);
953: }
955: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
956: {
957: PetscInt n = win->map->n;
958: const PetscScalar *xarray,*yarray;
959: PetscScalar *warray;
960: thrust::device_ptr<const PetscScalar> xptr,yptr;
961: thrust::device_ptr<PetscScalar> wptr;
962: PetscErrorCode ierr;
965: VecCUDAGetArrayReadWrite(win,&warray);
966: VecCUDAGetArrayRead(xin,&xarray);
967: VecCUDAGetArrayRead(yin,&yarray);
968: try {
969: wptr = thrust::device_pointer_cast(warray);
970: xptr = thrust::device_pointer_cast(xarray);
971: yptr = thrust::device_pointer_cast(yarray);
972: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
973: WaitForGPU();CHKERRCUDA(ierr);
974: } catch (char *ex) {
975: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
976: }
977: VecCUDARestoreArrayRead(xin,&xarray);
978: VecCUDARestoreArrayRead(yin,&yarray);
979: VecCUDARestoreArrayReadWrite(win,&warray);
980: PetscLogFlops(n);
981: return(0);
982: }
984: /* should do infinity norm in cuda */
986: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
987: {
988: PetscErrorCode ierr;
989: PetscInt n = xin->map->n;
990: PetscBLASInt one = 1, bn;
991: const PetscScalar *xarray;
992: cublasHandle_t cublasv2handle;
993: cublasStatus_t cberr;
994: cudaError_t err;
997: PetscCUBLASGetHandle(&cublasv2handle);
998: PetscBLASIntCast(n,&bn);
999: if (type == NORM_2 || type == NORM_FROBENIUS) {
1000: VecCUDAGetArrayRead(xin,&xarray);
1001: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1002: WaitForGPU();CHKERRCUDA(ierr);
1003: VecCUDARestoreArrayRead(xin,&xarray);
1004: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1005: } else if (type == NORM_INFINITY) {
1006: int i;
1007: VecCUDAGetArrayRead(xin,&xarray);
1008: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1009: err = cudaMemcpy(z,xarray+i,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1010: VecCUDARestoreArrayRead(xin,&xarray);
1011: } else if (type == NORM_1) {
1012: VecCUDAGetArrayRead(xin,&xarray);
1013: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1014: VecCUDARestoreArrayRead(xin,&xarray);
1015: WaitForGPU();CHKERRCUDA(ierr);
1016: PetscLogFlops(PetscMax(n-1.0,0.0));
1017: } else if (type == NORM_1_AND_2) {
1018: VecNorm_SeqCUDA(xin,NORM_1,z);
1019: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1020: }
1021: return(0);
1022: }
1024: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1025: {
1026: PetscErrorCode ierr;
1027: PetscReal n=s->map->n;
1028: const PetscScalar *sarray,*tarray;
1031: VecCUDAGetArrayRead(s,&sarray);
1032: VecCUDAGetArrayRead(t,&tarray);
1033: VecDot_SeqCUDA(s,t,dp);
1034: VecDot_SeqCUDA(t,t,nm);
1035: VecCUDARestoreArrayRead(s,&sarray);
1036: VecCUDARestoreArrayRead(t,&tarray);
1037: WaitForGPU();CHKERRCUDA(ierr);
1038: PetscLogFlops(4.0*n);
1039: return(0);
1040: }
1042: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1043: {
1045: cudaError_t err;
1048: if (v->spptr) {
1049: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1050: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1051: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1052: }
1053: if (((Vec_CUDA*)v->spptr)->stream) {
1054: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1055: }
1056: PetscFree(v->spptr);
1057: }
1058: VecDestroy_SeqCUDA_Private(v);
1059: return(0);
1060: }
1062: #if defined(PETSC_USE_COMPLEX)
1063: struct conjugate
1064: {
1065: __host__ __device__
1066: PetscScalar operator()(PetscScalar x)
1067: {
1068: return PetscConj(x);
1069: }
1070: };
1071: #endif
1073: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1074: {
1075: PetscScalar *xarray;
1076: PetscErrorCode ierr;
1077: #if defined(PETSC_USE_COMPLEX)
1078: PetscInt n = xin->map->n;
1079: thrust::device_ptr<PetscScalar> xptr;
1080: #endif
1083: VecCUDAGetArrayReadWrite(xin,&xarray);
1084: #if defined(PETSC_USE_COMPLEX)
1085: try {
1086: xptr = thrust::device_pointer_cast(xarray);
1087: thrust::transform(xptr,xptr+n,xptr,conjugate());
1088: WaitForGPU();CHKERRCUDA(ierr);
1089: } catch (char *ex) {
1090: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1091: }
1092: #endif
1093: VecCUDARestoreArrayReadWrite(xin,&xarray);
1094: return(0);
1095: }
1097: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1098: {
1100: cudaError_t err;
1107: if (w->data) {
1108: if (((Vec_Seq*)w->data)->array_allocated) {
1109: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1110: }
1111: ((Vec_Seq*)w->data)->array = NULL;
1112: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1113: }
1114: if (w->spptr) {
1115: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1116: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1117: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1118: }
1119: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1120: PetscFree(w->spptr);
1121: }
1123: if (v->petscnative) {
1124: PetscFree(w->data);
1125: w->data = v->data;
1126: w->valid_GPU_array = v->valid_GPU_array;
1127: w->spptr = v->spptr;
1128: PetscObjectStateIncrease((PetscObject)w);
1129: } else {
1130: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1131: w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1132: VecCUDAAllocateCheck(w);
1133: }
1134: return(0);
1135: }
1137: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1138: {
1140: cudaError_t err;
1147: if (v->petscnative) {
1148: v->data = w->data;
1149: v->valid_GPU_array = w->valid_GPU_array;
1150: v->spptr = w->spptr;
1151: VecCUDACopyFromGPU(v);
1152: PetscObjectStateIncrease((PetscObject)v);
1153: w->data = 0;
1154: w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1155: w->spptr = 0;
1156: } else {
1157: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1158: if ((Vec_CUDA*)w->spptr) {
1159: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1160: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1161: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1162: PetscFree(w->spptr);
1163: }
1164: }
1165: return(0);
1166: }
1168: /*@C
1169: VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1171: This function has semantics similar to VecGetArray(): the pointer
1172: returned by this function points to a consistent view of the vector
1173: data. This may involve a copy operation of data from the host to the
1174: device if the data on the device is out of date. If the device
1175: memory hasn't been allocated previously it will be allocated as part
1176: of this function call. VecCUDAGetArrayReadWrite() assumes that
1177: the user will modify the vector data. This is similar to
1178: intent(inout) in Fortran.
1180: The CUDA device pointer has to be released by calling
1181: VecCUDARestoreArrayReadWrite(). Upon restoring the vector data
1182: the data on the host will be marked as out of date. A subsequent
1183: access of the host data will thus incur a data transfer from the
1184: device to the host.
1187: Input Parameter:
1188: . v - the vector
1190: Output Parameter:
1191: . a - the CUDA device pointer
1193: Fortran note:
1194: This function is not currently available from Fortran.
1196: Level: intermediate
1198: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1199: @*/
1200: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1201: {
1206: *a = 0;
1207: VecCUDACopyToGPU(v);
1208: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1209: return(0);
1210: }
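/* Usage sketch (not part of this source file): obtain the device pointer, modify the data in a
   user-provided kernel, then restore it so the offload flag is updated. "MyKernel" is hypothetical.

     PetscScalar    *d_v;
     PetscErrorCode ierr;
     ierr = VecCUDAGetArrayReadWrite(v,&d_v);CHKERRQ(ierr);
     MyKernel<<<128,128>>>(d_v,n);                               // reads and writes d_v in place
     ierr = VecCUDARestoreArrayReadWrite(v,&d_v);CHKERRQ(ierr);  // host copy is now marked out of date
*/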
1212: /*@C
1213: VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().
1215: This marks the host data as out of date. Subsequent access to the
1216: vector data on the host side with for instance VecGetArray() incurs a
1217: data transfer.
1219: Input Parameter:
1220: + v - the vector
1221: - a - the CUDA device pointer. This pointer is invalid after
1222: VecCUDARestoreArrayReadWrite() returns.
1224: Fortran note:
1225: This function is not currently available from Fortran.
1227: Level: intermediate
1229: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1230: @*/
1231: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1232: {
1237: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
1239: PetscObjectStateIncrease((PetscObject)v);
1240: return(0);
1241: }
1243: /*@C
1244: VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.
1246: This function is analogous to VecGetArrayRead(): The pointer
1247: returned by this function points to a consistent view of the vector
1248: data. This may involve a copy operation of data from the host to the
1249: device if the data on the device is out of date. If the device
1250: memory hasn't been allocated previously it will be allocated as part
1251: of this function call. VecCUDAGetArrayRead() assumes that the
1252: user will not modify the vector data. This is analogous to
1253: intent(in) in Fortran.
1255: The CUDA device pointer has to be released by calling
1256: VecCUDARestoreArrayRead(). If the data on the host side was
1257: previously up to date it will remain so, i.e. data on both the device
1258: and the host is up to date. Accessing data on the host side does not
1259: incur a device to host data transfer.
1261: Input Parameter:
1262: . v - the vector
1264: Output Parameter:
1265: . a - the CUDA pointer.
1267: Fortran note:
1268: This function is not currently available from Fortran.
1270: Level: intermediate
1272: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1273: @*/
1274: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1275: {
1280: *a = 0;
1281: VecCUDACopyToGPU(v);
1282: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1283: return(0);
1284: }
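/* Usage sketch (not part of this source file): read-only access, e.g. to feed the data to a kernel
   or library call that does not modify it. "MyReadOnlyKernel" is hypothetical.

     const PetscScalar *d_x;
     PetscErrorCode    ierr;
     ierr = VecCUDAGetArrayRead(x,&d_x);CHKERRQ(ierr);
     MyReadOnlyKernel<<<128,128>>>(d_x,n);                 // only reads d_x
     ierr = VecCUDARestoreArrayRead(x,&d_x);CHKERRQ(ierr); // an up-to-date host copy remains valid
*/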
1286: /*@C
1287: VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().
1289: If the data on the host side was previously up to date it will remain
1290: so, i.e. data on both the device and the host is up to date.
1291: Accessing data on the host side e.g. with VecGetArray() does not
1292: incur a device to host data transfer.
1294: Input Parameter:
1295: + v - the vector
1296: - a - the CUDA device pointer. This pointer is invalid after
1297: VecCUDARestoreArrayRead() returns.
1299: Fortran note:
1300: This function is not currently available from Fortran.
1302: Level: intermediate
1304: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1305: @*/
1306: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1307: {
1310: return(0);
1311: }
1313: /*@C
1314: VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.
1316: The data pointed to by the device pointer is uninitialized. The user
1317: may not read from this data. Furthermore, the entire array needs to
1318: be filled by the user to obtain well-defined behaviour. The device
1319: memory will be allocated by this function if it hasn't been allocated
1320: previously. This is analogous to intent(out) in Fortran.
1322: The device pointer needs to be released with
1323: VecCUDARestoreArrayWrite(). When the pointer is released the
1324: host data of the vector is marked as out of date. Subsequent access
1325: of the host data with e.g. VecGetArray() incurs a device to host data
1326: transfer.
1329: Input Parameter:
1330: . v - the vector
1332: Output Parameter:
1333: . a - the CUDA pointer
1335: Fortran note:
1336: This function is not currently available from Fortran.
1338: Level: advanced
1340: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1341: @*/
1342: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1343: {
1348: *a = 0;
1349: VecCUDAAllocateCheck(v);
1350: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1351: return(0);
1352: }
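/* Usage sketch (not part of this source file): write-only access, useful when every entry will be
   overwritten and the current contents need not be transferred to the GPU first. "FillKernel" is hypothetical.

     PetscScalar    *d_w;
     PetscErrorCode ierr;
     ierr = VecCUDAGetArrayWrite(w,&d_w);CHKERRQ(ierr);
     FillKernel<<<128,128>>>(d_w,n);                       // must write every entry of d_w
     ierr = VecCUDARestoreArrayWrite(w,&d_w);CHKERRQ(ierr);
*/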
1354: /*@C
1355: VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().
1357: Data on the host will be marked as out of date. Subsequent access of
1358: the data on the host side e.g. with VecGetArray() will incur a device
1359: to host data transfer.
1361: Input Parameter:
1362: + v - the vector
1363: - a - the CUDA device pointer. This pointer is invalid after
1364: VecCUDARestoreArrayWrite() returns.
1366: Fortran note:
1367: This function is not currently available from Fortran.
1369: Level: intermediate
1371: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1372: @*/
1373: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1374: {
1379: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
1381: PetscObjectStateIncrease((PetscObject)v);
1382: return(0);
1383: }
1385: /*@C
1386: VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1387: GPU array provided by the user. This is useful to avoid copying an
1388: array into a vector.
1390: Not Collective
1392: Input Parameters:
1393: + vec - the vector
1394: - array - the GPU array
1396: Notes:
1397: You can return to the original GPU array with a call to VecCUDAResetArray()
1398: It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1399: same time on the same vector.
1401: Level: developer
1403: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()
1405: @*/
1406: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1407: {
1412: VecCUDACopyToGPU(vin);
1413: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1414: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1415: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1416: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1417: PetscObjectStateIncrease((PetscObject)vin);
1418: return(0);
1419: }
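/* Usage sketch (not part of this source file): temporarily make the vector wrap a device buffer the
   caller already owns, then switch back. "d_user" is a hypothetical device allocation of at least
   the vector's local length.

     ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);   // v now uses d_user as its GPU array
     ...                                                  // use v with the usual Vec operations
     ierr = VecCUDAResetArray(v);CHKERRQ(ierr);           // restore the original GPU array
*/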
1421: /*@C
1422: VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1423: with a GPU array provided by the user. This is useful to avoid copying
1424: a GPU array into a vector.
1426: Not Collective
1428: Input Parameters:
1429: + vec - the vector
1430: - array - the GPU array
1432: Notes:
1433: This permanently replaces the GPU array and frees the memory associated
1434: with the old GPU array.
1436: The memory passed in CANNOT be freed by the user. It will be freed
1437: when the vector is destroyed.
1439: Not supported from Fortran
1441: Level: developer
1443: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()
1445: @*/
1446: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1447: {
1448: cudaError_t err;
1453: err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1454: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1455: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1456: PetscObjectStateIncrease((PetscObject)vin);
1457: return(0);
1458: }
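/* Usage sketch (not part of this source file): hand ownership of a freshly allocated device buffer to
   the vector; the previous GPU array is freed here and the new one is freed later by VecDestroy().

     PetscScalar    *d_new;
     cudaError_t    err;
     PetscErrorCode ierr;
     err  = cudaMalloc((void**)&d_new,n*sizeof(PetscScalar));CHKERRCUDA(err);  // n = local length
     ierr = VecCUDAReplaceArray(v,d_new);CHKERRQ(ierr);   // do not cudaFree(d_new) yourself
*/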
1460: /*@C
1461: VecCUDAResetArray - Resets a vector to use its default memory. Call this
1462: after the use of VecCUDAPlaceArray().
1464: Not Collective
1466: Input Parameters:
1467: . vec - the vector
1469: Level: developer
1471: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()
1473: @*/
1474: PetscErrorCode VecCUDAResetArray(Vec vin)
1475: {
1480: VecCUDACopyToGPU(vin);
1481: ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1482: ((Vec_Seq*)vin->data)->unplacedarray = 0;
1483: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1484: PetscObjectStateIncrease((PetscObject)vin);
1485: return(0);
1486: }