Actual source code: veccuda2.cu
petsc-3.12.5 2020-03-29
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
18: /*
19: Allocates space for the vector array on the GPU if it does not already exist.
20: Does NOT change the vector's offload mask (v->offloadmask).
21: Does NOT zero the CUDA array.
23: */
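/* The vector's offload mask (v->offloadmask) records where the up-to-date copy of the data lives:
   PETSC_OFFLOAD_UNALLOCATED, PETSC_OFFLOAD_CPU, PETSC_OFFLOAD_GPU, or PETSC_OFFLOAD_BOTH.
   The copy routines below consult this mask to decide whether a host-to-device or
   device-to-host transfer is actually required. */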
24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
25: {
27: cudaError_t err;
28: Vec_CUDA *veccuda;
31: if (!v->spptr) {
32: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
33: veccuda = (Vec_CUDA*)v->spptr;
34: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
35: veccuda->GPUarray = veccuda->GPUarray_allocated;
36: veccuda->stream = 0; /* using default stream */
37: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
38: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
39: if (v->data && ((Vec_Seq*)v->data)->array) {
40: v->offloadmask = PETSC_OFFLOAD_CPU;
41: } else {
42: v->offloadmask = PETSC_OFFLOAD_GPU;
43: }
44: }
45: }
46: return(0);
47: }
49: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
50: PetscErrorCode VecCUDACopyToGPU(Vec v)
51: {
53: cudaError_t err;
54: Vec_CUDA *veccuda;
55: PetscScalar *varray;
59: VecCUDAAllocateCheck(v);
60: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
61: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
62: veccuda = (Vec_CUDA*)v->spptr;
63: varray = veccuda->GPUarray;
64: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
65: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
66: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
67: v->offloadmask = PETSC_OFFLOAD_BOTH;
68: }
69: return(0);
70: }
72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
73: {
74: PetscScalar *varray;
76: cudaError_t err;
77: PetscScalar *cpuPtr, *gpuPtr;
78: Vec_Seq *s;
79: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
80: PetscInt lowestIndex,n;
84: VecCUDAAllocateCheck(v);
85: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
86: s = (Vec_Seq*)v->data;
87: if (mode & SCATTER_REVERSE) {
88: lowestIndex = ptop_scatter->sendLowestIndex;
89: n = ptop_scatter->ns;
90: } else {
91: lowestIndex = ptop_scatter->recvLowestIndex;
92: n = ptop_scatter->nr;
93: }
95: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
96: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
97: gpuPtr = varray + lowestIndex;
98: cpuPtr = s->array + lowestIndex;
100: /* Note: this code copies the smallest contiguous chunk of data
101: containing ALL of the indices */
102: err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
103: PetscLogCpuToGpu(n*sizeof(PetscScalar));
105: /* Set the buffer states */
106: v->offloadmask = PETSC_OFFLOAD_BOTH;
107: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
108: }
109: return(0);
110: }
113: /*
114: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
115: */
116: PetscErrorCode VecCUDACopyFromGPU(Vec v)
117: {
119: cudaError_t err;
120: Vec_CUDA *veccuda;
121: PetscScalar *varray;
125: VecCUDAAllocateCheckHost(v);
126: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
127: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
128: veccuda = (Vec_CUDA*)v->spptr;
129: varray = veccuda->GPUarray;
130: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
131: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
132: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
133: v->offloadmask = PETSC_OFFLOAD_BOTH;
134: }
135: return(0);
136: }
138: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
139: which means that we need to recombine the data at some point before using any of the standard functions.
140: We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
141: where the calls must always come in pairs (a minimal sketch of that pairing follows below).
142: */
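/* For reference, a minimal sketch (standard PETSc usage, not specific to this file) of the
   paired-access idiom mentioned above:

       PetscScalar *a;
       ierr = VecGetArray(v,&a);CHKERRQ(ierr);      // host array is valid only between the pair
       a[0] = 2.0*a[0];
       ierr = VecRestoreArray(v,&a);CHKERRQ(ierr);  // tells PETSc the host copy has changed

   A "copy some" scheme would need an analogous begin/end pairing to keep the CPU and GPU
   copies consistent. */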
143: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
144: {
145: const PetscScalar *varray, *gpuPtr;
146: PetscErrorCode ierr;
147: cudaError_t err;
148: PetscScalar *cpuPtr;
149: Vec_Seq *s;
150: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
151: PetscInt lowestIndex,n;
155: VecCUDAAllocateCheckHost(v);
156: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
157: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
158: if (mode & SCATTER_REVERSE) {
159: lowestIndex = ptop_scatter->recvLowestIndex;
160: n = ptop_scatter->nr;
161: } else {
162: lowestIndex = ptop_scatter->sendLowestIndex;
163: n = ptop_scatter->ns;
164: }
166: varray=((Vec_CUDA*)v->spptr)->GPUarray;
167: s = (Vec_Seq*)v->data;
168: gpuPtr = varray + lowestIndex;
169: cpuPtr = s->array + lowestIndex;
171: /* Note: this code copies the smallest contiguous chunk of data
172: containing ALL of the indices */
173: err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
174: PetscLogGpuToCpu(n*sizeof(PetscScalar));
176: VecCUDARestoreArrayRead(v,&varray);
177: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
178: }
179: return(0);
180: }
182: /*MC
183: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
185: Options Database Keys:
186: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
188: Level: beginner
190: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
191: M*/
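/* Illustrative usage, not part of the original source: a minimal sketch of creating a VECSEQCUDA
   vector from application code, assuming a CUDA-enabled PETSc build. The helper name
   ExampleCreateSeqCUDA is hypothetical. */
static PetscErrorCode ExampleCreateSeqCUDA(PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  ierr = VecCreate(PETSC_COMM_SELF,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUDA);CHKERRQ(ierr);  /* or VecSetFromOptions() together with -vec_type seqcuda */
  ierr = VecSet(*v,1.0);CHKERRQ(ierr);             /* for VECSEQCUDA this fill runs on the GPU */
  return(0);
}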
193: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
194: {
195: const PetscScalar *xarray;
196: PetscScalar *yarray;
197: PetscErrorCode ierr;
198: PetscBLASInt one=1,bn;
199: PetscScalar sone=1.0;
200: cublasHandle_t cublasv2handle;
201: cublasStatus_t cberr;
202: cudaError_t err;
205: PetscCUBLASGetHandle(&cublasv2handle);
206: PetscBLASIntCast(yin->map->n,&bn);
207: VecCUDAGetArrayRead(xin,&xarray);
208: VecCUDAGetArray(yin,&yarray);
209: PetscLogGpuTimeBegin();
210: if (alpha == (PetscScalar)0.0) {
211: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
212: } else if (alpha == (PetscScalar)1.0) {
213: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
214: PetscLogGpuFlops(1.0*yin->map->n);
215: } else {
216: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
217: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
218: PetscLogGpuFlops(2.0*yin->map->n);
219: }
220: WaitForGPU();CHKERRCUDA(ierr);
221: PetscLogGpuTimeEnd();
222: VecCUDARestoreArrayRead(xin,&xarray);
223: VecCUDARestoreArray(yin,&yarray);
224: return(0);
225: }
227: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
228: {
229: const PetscScalar *xarray;
230: PetscScalar *yarray;
231: PetscErrorCode ierr;
232: PetscBLASInt one=1,bn;
233: cublasHandle_t cublasv2handle;
234: cublasStatus_t cberr;
237: PetscCUBLASGetHandle(&cublasv2handle);
238: if (alpha != (PetscScalar)0.0) {
239: PetscBLASIntCast(yin->map->n,&bn);
240: VecCUDAGetArrayRead(xin,&xarray);
241: VecCUDAGetArray(yin,&yarray);
242: PetscLogGpuTimeBegin();
243: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
244: WaitForGPU();CHKERRCUDA(ierr);
245: PetscLogGpuTimeEnd();
246: VecCUDARestoreArrayRead(xin,&xarray);
247: VecCUDARestoreArray(yin,&yarray);
248: PetscLogGpuFlops(2.0*yin->map->n);
249: }
250: return(0);
251: }
253: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
254: {
255: PetscInt n = xin->map->n;
256: const PetscScalar *xarray=NULL,*yarray=NULL;
257: PetscScalar *warray=NULL;
258: thrust::device_ptr<const PetscScalar> xptr,yptr;
259: thrust::device_ptr<PetscScalar> wptr;
260: PetscErrorCode ierr;
263: VecCUDAGetArrayWrite(win,&warray);
264: VecCUDAGetArrayRead(xin,&xarray);
265: VecCUDAGetArrayRead(yin,&yarray);
266: PetscLogGpuTimeBegin();
267: try {
268: wptr = thrust::device_pointer_cast(warray);
269: xptr = thrust::device_pointer_cast(xarray);
270: yptr = thrust::device_pointer_cast(yarray);
271: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
272: WaitForGPU();CHKERRCUDA(ierr);
273: } catch (char *ex) {
274: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
275: }
276: PetscLogGpuTimeEnd();
277: PetscLogGpuFlops(n);
278: VecCUDARestoreArrayRead(xin,&xarray);
279: VecCUDARestoreArrayRead(yin,&yarray);
280: VecCUDARestoreArrayWrite(win,&warray);
281: return(0);
282: }
284: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
285: {
286: const PetscScalar *xarray=NULL,*yarray=NULL;
287: PetscScalar *warray=NULL;
288: PetscErrorCode ierr;
289: PetscBLASInt one=1,bn;
290: cublasHandle_t cublasv2handle;
291: cublasStatus_t cberr;
292: cudaError_t err;
295: PetscCUBLASGetHandle(&cublasv2handle);
296: PetscBLASIntCast(win->map->n,&bn);
297: if (alpha == (PetscScalar)0.0) {
298: VecCopy_SeqCUDA(yin,win);
299: } else {
300: VecCUDAGetArrayRead(xin,&xarray);
301: VecCUDAGetArrayRead(yin,&yarray);
302: VecCUDAGetArrayWrite(win,&warray);
303: PetscLogGpuTimeBegin();
304: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
305: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
306: WaitForGPU();CHKERRCUDA(ierr);
307: PetscLogGpuTimeEnd();
308: PetscLogGpuFlops(2*win->map->n);
309: VecCUDARestoreArrayRead(xin,&xarray);
310: VecCUDARestoreArrayRead(yin,&yarray);
311: VecCUDARestoreArrayWrite(win,&warray);
312: }
313: return(0);
314: }
316: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
317: {
319: PetscInt n = xin->map->n,j,j_rem;
320: PetscScalar alpha0,alpha1,alpha2,alpha3;
323: PetscLogGpuFlops(nv*2.0*n);
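/* Handle the nv mod 4 leftover vectors in the switch below first; the main loop then applies
   the remaining vectors four at a time, each term via VecAXPY_SeqCUDA(). */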
324: switch (j_rem=nv&0x3) {
325: case 3:
326: alpha0 = alpha[0];
327: alpha1 = alpha[1];
328: alpha2 = alpha[2];
329: alpha += 3;
330: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
331: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
332: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
333: y += 3;
334: break;
335: case 2:
336: alpha0 = alpha[0];
337: alpha1 = alpha[1];
338: alpha +=2;
339: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
340: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
341: y +=2;
342: break;
343: case 1:
344: alpha0 = *alpha++;
345: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
346: y +=1;
347: break;
348: }
349: for (j=j_rem; j<nv; j+=4) {
350: alpha0 = alpha[0];
351: alpha1 = alpha[1];
352: alpha2 = alpha[2];
353: alpha3 = alpha[3];
354: alpha += 4;
355: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
356: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
357: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
358: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
359: y += 4;
360: }
361: WaitForGPU();CHKERRCUDA(ierr);
362: return(0);
363: }
365: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
366: {
367: const PetscScalar *xarray,*yarray;
368: PetscErrorCode ierr;
369: PetscBLASInt one=1,bn;
370: cublasHandle_t cublasv2handle;
371: cublasStatus_t cberr;
374: PetscCUBLASGetHandle(&cublasv2handle);
375: PetscBLASIntCast(yin->map->n,&bn);
376: VecCUDAGetArrayRead(xin,&xarray);
377: VecCUDAGetArrayRead(yin,&yarray);
378: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
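/* Concretely: PETSc defines VecDot(x,y) = sum_i x_i*conj(y_i), while the conjugated cuBLAS dot
   product conjugates its first vector argument. Passing (y,x) therefore yields
   sum_i conj(y_i)*x_i, which matches the PETSc convention; for real scalars the order is
   immaterial. */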
379: PetscLogGpuTimeBegin();
380: cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
381: WaitForGPU();CHKERRCUDA(ierr);
382: PetscLogGpuTimeEnd();
383: if (xin->map->n >0) {
384: PetscLogGpuFlops(2.0*xin->map->n-1);
385: }
386: VecCUDARestoreArrayRead(xin,&xarray);
387: VecCUDARestoreArrayRead(yin,&yarray);
388: return(0);
389: }
391: //
392: // CUDA kernels for MDot to follow
393: //
395: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
396: #define MDOT_WORKGROUP_SIZE 128
397: #define MDOT_WORKGROUP_NUM 128
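// Each kernel below computes several dot products against the same x in one pass: every thread
// block accumulates per-vector partial sums in shared memory, reduces them within the block, and
// stores its result for vector j at group_results[j*gridDim.x + blockIdx.x] (gridDim.x equals
// MDOT_WORKGROUP_NUM at launch). VecMDot_SeqCUDA() then finishes the reduction on the host by
// summing the MDOT_WORKGROUP_NUM partial results for each vector.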
399: #if !defined(PETSC_USE_COMPLEX)
400: // M = 2:
401: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
402: PetscInt size, PetscScalar *group_results)
403: {
404: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
405: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
406: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
407: PetscInt vec_start_index = blockIdx.x * entries_per_group;
408: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
410: PetscScalar entry_x = 0;
411: PetscScalar group_sum0 = 0;
412: PetscScalar group_sum1 = 0;
413: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
414: entry_x = x[i]; // load only once from global memory!
415: group_sum0 += entry_x * y0[i];
416: group_sum1 += entry_x * y1[i];
417: }
418: tmp_buffer[threadIdx.x] = group_sum0;
419: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
421: // parallel reduction
422: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
423: __syncthreads();
424: if (threadIdx.x < stride) {
425: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
426: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
427: }
428: }
430: // write result of group to group_results
431: if (threadIdx.x == 0) {
432: group_results[blockIdx.x] = tmp_buffer[0];
433: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
434: }
435: }
437: // M = 3:
438: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
439: PetscInt size, PetscScalar *group_results)
440: {
441: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
442: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
443: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
444: PetscInt vec_start_index = blockIdx.x * entries_per_group;
445: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
447: PetscScalar entry_x = 0;
448: PetscScalar group_sum0 = 0;
449: PetscScalar group_sum1 = 0;
450: PetscScalar group_sum2 = 0;
451: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
452: entry_x = x[i]; // load only once from global memory!
453: group_sum0 += entry_x * y0[i];
454: group_sum1 += entry_x * y1[i];
455: group_sum2 += entry_x * y2[i];
456: }
457: tmp_buffer[threadIdx.x] = group_sum0;
458: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
459: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
461: // parallel reduction
462: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
463: __syncthreads();
464: if (threadIdx.x < stride) {
465: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
466: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
467: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
468: }
469: }
471: // write result of group to group_results
472: if (threadIdx.x == 0) {
473: group_results[blockIdx.x ] = tmp_buffer[0];
474: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
475: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
476: }
477: }
479: // M = 4:
480: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
481: PetscInt size, PetscScalar *group_results)
482: {
483: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
484: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
485: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
486: PetscInt vec_start_index = blockIdx.x * entries_per_group;
487: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
489: PetscScalar entry_x = 0;
490: PetscScalar group_sum0 = 0;
491: PetscScalar group_sum1 = 0;
492: PetscScalar group_sum2 = 0;
493: PetscScalar group_sum3 = 0;
494: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
495: entry_x = x[i]; // load only once from global memory!
496: group_sum0 += entry_x * y0[i];
497: group_sum1 += entry_x * y1[i];
498: group_sum2 += entry_x * y2[i];
499: group_sum3 += entry_x * y3[i];
500: }
501: tmp_buffer[threadIdx.x] = group_sum0;
502: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
503: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
504: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
506: // parallel reduction
507: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
508: __syncthreads();
509: if (threadIdx.x < stride) {
510: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
511: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
512: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
513: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
514: }
515: }
517: // write result of group to group_results
518: if (threadIdx.x == 0) {
519: group_results[blockIdx.x ] = tmp_buffer[0];
520: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
521: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
522: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
523: }
524: }
526: // M = 8:
527: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
528: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
529: PetscInt size, PetscScalar *group_results)
530: {
531: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
532: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
533: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
534: PetscInt vec_start_index = blockIdx.x * entries_per_group;
535: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
537: PetscScalar entry_x = 0;
538: PetscScalar group_sum0 = 0;
539: PetscScalar group_sum1 = 0;
540: PetscScalar group_sum2 = 0;
541: PetscScalar group_sum3 = 0;
542: PetscScalar group_sum4 = 0;
543: PetscScalar group_sum5 = 0;
544: PetscScalar group_sum6 = 0;
545: PetscScalar group_sum7 = 0;
546: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
547: entry_x = x[i]; // load only once from global memory!
548: group_sum0 += entry_x * y0[i];
549: group_sum1 += entry_x * y1[i];
550: group_sum2 += entry_x * y2[i];
551: group_sum3 += entry_x * y3[i];
552: group_sum4 += entry_x * y4[i];
553: group_sum5 += entry_x * y5[i];
554: group_sum6 += entry_x * y6[i];
555: group_sum7 += entry_x * y7[i];
556: }
557: tmp_buffer[threadIdx.x] = group_sum0;
558: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
559: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
560: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
561: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
562: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
563: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
564: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
566: // parallel reduction
567: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
568: __syncthreads();
569: if (threadIdx.x < stride) {
570: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
571: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
572: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
573: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
574: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
575: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
576: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
577: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
578: }
579: }
581: // write result of group to group_results
582: if (threadIdx.x == 0) {
583: group_results[blockIdx.x ] = tmp_buffer[0];
584: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
585: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
586: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
587: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
588: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
589: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
590: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
591: }
592: }
593: #endif /* !defined(PETSC_USE_COMPLEX) */
595: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
596: {
597: PetscErrorCode ierr;
598: PetscInt i,n = xin->map->n,current_y_index = 0;
599: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
600: PetscScalar *group_results_gpu;
601: #if !defined(PETSC_USE_COMPLEX)
602: PetscInt j;
603: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
604: #endif
605: cudaError_t cuda_ierr;
606: PetscBLASInt one=1,bn;
607: cublasHandle_t cublasv2handle;
608: cublasStatus_t cberr;
611: PetscCUBLASGetHandle(&cublasv2handle);
612: PetscBLASIntCast(xin->map->n,&bn);
613: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
614: /* Handle the case of local size zero first */
615: if (!xin->map->n) {
616: for (i=0; i<nv; ++i) z[i] = 0;
617: return(0);
618: }
620: // allocate scratchpad memory for the results of individual work groups:
621: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
623: VecCUDAGetArrayRead(xin,&xptr);
625: while (current_y_index < nv)
626: {
627: switch (nv - current_y_index) {
629: case 7:
630: case 6:
631: case 5:
632: case 4:
633: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
634: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
635: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
636: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
637: PetscLogGpuTimeBegin();
638: #if defined(PETSC_USE_COMPLEX)
639: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
640: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
641: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
642: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
643: #else
644: // run kernel:
645: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
646: PetscLogGpuTimeEnd();
648: // copy results back to CPU:
649: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
651: // sum group results into z:
652: for (j=0; j<4; ++j) {
653: z[current_y_index + j] = 0;
654: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
655: }
656: #endif
657: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
658: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
659: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
660: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
661: current_y_index += 4;
662: break;
664: case 3:
665: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
666: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
667: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
669: PetscLogGpuTimeBegin();
670: #if defined(PETSC_USE_COMPLEX)
671: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
672: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
673: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
674: #else
675: // run kernel:
676: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
677: PetscLogGpuTimeEnd();
679: // copy results back to CPU:
680: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
682: // sum group results into z:
683: for (j=0; j<3; ++j) {
684: z[current_y_index + j] = 0;
685: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
686: }
687: #endif
688: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
689: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
690: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
691: current_y_index += 3;
692: break;
694: case 2:
695: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
696: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
697: PetscLogGpuTimeBegin();
698: #if defined(PETSC_USE_COMPLEX)
699: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
700: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
701: #else
702: // run kernel:
703: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
704: PetscLogGpuTimeEnd();
706: // copy results back to CPU:
707: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
709: // sum group results into z:
710: for (j=0; j<2; ++j) {
711: z[current_y_index + j] = 0;
712: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
713: }
714: #endif
715: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
716: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
717: current_y_index += 2;
718: break;
720: case 1:
721: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
722: PetscLogGpuTimeBegin();
723: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
724: PetscLogGpuTimeEnd();
725: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
726: current_y_index += 1;
727: break;
729: default: // 8 or more vectors left
730: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
731: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
732: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
733: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
734: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
735: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
736: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
737: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
738: PetscLogGpuTimeBegin();
739: #if defined(PETSC_USE_COMPLEX)
740: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
741: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
742: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
743: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
744: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
745: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
746: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
747: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
748: #else
749: // run kernel:
750: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
751: PetscLogGpuTimeEnd();
753: // copy results back to CPU:
754: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
756: // sum group results into z:
757: for (j=0; j<8; ++j) {
758: z[current_y_index + j] = 0;
759: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
760: }
761: #endif
762: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
763: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
764: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
765: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
766: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
767: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
768: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
769: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
770: current_y_index += 8;
771: break;
772: }
773: }
774: VecCUDARestoreArrayRead(xin,&xptr);
776: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
777: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
778: return(0);
779: }
781: #undef MDOT_WORKGROUP_SIZE
782: #undef MDOT_WORKGROUP_NUM
784: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
785: {
786: PetscInt n = xin->map->n;
787: PetscScalar *xarray=NULL;
788: thrust::device_ptr<PetscScalar> xptr;
789: PetscErrorCode ierr;
790: cudaError_t err;
793: VecCUDAGetArrayWrite(xin,&xarray);
794: PetscLogGpuTimeBegin();
795: if (alpha == (PetscScalar)0.0) {
796: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
797: } else {
798: try {
799: xptr = thrust::device_pointer_cast(xarray);
800: thrust::fill(xptr,xptr+n,alpha);
801: } catch (char *ex) {
802: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
803: }
804: }
805: WaitForGPU();CHKERRCUDA(ierr);
806: PetscLogGpuTimeEnd();
807: VecCUDARestoreArrayWrite(xin,&xarray);
808: return(0);
809: }
811: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
812: {
813: PetscScalar *xarray;
815: PetscBLASInt one=1,bn;
816: cublasHandle_t cublasv2handle;
817: cublasStatus_t cberr;
820: if (alpha == (PetscScalar)0.0) {
821: VecSet_SeqCUDA(xin,alpha);
822: WaitForGPU();CHKERRCUDA(ierr);
823: } else if (alpha != (PetscScalar)1.0) {
824: PetscCUBLASGetHandle(&cublasv2handle);
825: PetscBLASIntCast(xin->map->n,&bn);
826: VecCUDAGetArray(xin,&xarray);
827: PetscLogGpuTimeBegin();
828: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
829: VecCUDARestoreArray(xin,&xarray);
830: WaitForGPU();CHKERRCUDA(ierr);
831: PetscLogGpuTimeEnd();
832: }
833: PetscLogGpuFlops(xin->map->n);
834: return(0);
835: }
837: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
838: {
839: const PetscScalar *xarray,*yarray;
840: PetscErrorCode ierr;
841: PetscBLASInt one=1,bn;
842: cublasHandle_t cublasv2handle;
843: cublasStatus_t cberr;
846: PetscCUBLASGetHandle(&cublasv2handle);
847: PetscBLASIntCast(xin->map->n,&bn);
848: VecCUDAGetArrayRead(xin,&xarray);
849: VecCUDAGetArrayRead(yin,&yarray);
850: PetscLogGpuTimeBegin();
851: cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
852: WaitForGPU();CHKERRCUDA(ierr);
853: PetscLogGpuTimeEnd();
854: if (xin->map->n > 0) {
855: PetscLogGpuFlops(2.0*xin->map->n-1);
856: }
857: VecCUDARestoreArrayRead(yin,&yarray);
858: VecCUDARestoreArrayRead(xin,&xarray);
859: return(0);
860: }
862: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
863: {
864: const PetscScalar *xarray;
865: PetscScalar *yarray;
866: PetscErrorCode ierr;
867: cudaError_t err;
870: if (xin != yin) {
871: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
872: PetscBool yiscuda;
874: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
875: VecCUDAGetArrayRead(xin,&xarray);
876: if (yiscuda) {
877: VecCUDAGetArrayWrite(yin,&yarray);
878: } else {
879: VecGetArrayWrite(yin,&yarray);
880: }
881: PetscLogGpuTimeBegin();
882: if (yiscuda) {
883: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
884: } else {
885: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
886: }
887: WaitForGPU();CHKERRCUDA(ierr);
888: PetscLogGpuTimeEnd();
889: VecCUDARestoreArrayRead(xin,&xarray);
890: if (yiscuda) {
891: VecCUDARestoreArrayWrite(yin,&yarray);
892: } else {
893: VecRestoreArrayWrite(yin,&yarray);
894: }
895: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
896: /* copy in CPU if we are on the CPU */
897: VecCopy_SeqCUDA_Private(xin,yin);
898: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
899: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
900: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
901: /* copy in CPU */
902: VecCopy_SeqCUDA_Private(xin,yin);
903: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
904: /* copy in GPU */
905: VecCUDAGetArrayRead(xin,&xarray);
906: VecCUDAGetArrayWrite(yin,&yarray);
907: PetscLogGpuTimeBegin();
908: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
909: PetscLogGpuTimeEnd();
910: VecCUDARestoreArrayRead(xin,&xarray);
911: VecCUDARestoreArrayWrite(yin,&yarray);
912: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
913: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
914: default to copy in GPU (this is an arbitrary choice) */
915: VecCUDAGetArrayRead(xin,&xarray);
916: VecCUDAGetArrayWrite(yin,&yarray);
917: PetscLogGpuTimeBegin();
918: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
919: PetscLogGpuTimeEnd();
920: VecCUDARestoreArrayRead(xin,&xarray);
921: VecCUDARestoreArrayWrite(yin,&yarray);
922: } else {
923: VecCopy_SeqCUDA_Private(xin,yin);
924: }
925: }
926: }
927: return(0);
928: }
930: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
931: {
933: PetscBLASInt one = 1,bn;
934: PetscScalar *xarray,*yarray;
935: cublasHandle_t cublasv2handle;
936: cublasStatus_t cberr;
939: PetscCUBLASGetHandle(&cublasv2handle);
940: PetscBLASIntCast(xin->map->n,&bn);
941: if (xin != yin) {
942: VecCUDAGetArray(xin,&xarray);
943: VecCUDAGetArray(yin,&yarray);
944: PetscLogGpuTimeBegin();
945: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
946: WaitForGPU();CHKERRCUDA(ierr);
947: PetscLogGpuTimeEnd();
948: VecCUDARestoreArray(xin,&xarray);
949: VecCUDARestoreArray(yin,&yarray);
950: }
951: return(0);
952: }
954: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
955: {
956: PetscErrorCode ierr;
957: PetscScalar a = alpha,b = beta;
958: const PetscScalar *xarray;
959: PetscScalar *yarray;
960: PetscBLASInt one = 1, bn;
961: cublasHandle_t cublasv2handle;
962: cublasStatus_t cberr;
963: cudaError_t err;
966: PetscCUBLASGetHandle(&cublasv2handle);
967: PetscBLASIntCast(yin->map->n,&bn);
968: if (a == (PetscScalar)0.0) {
969: VecScale_SeqCUDA(yin,beta);
970: } else if (b == (PetscScalar)1.0) {
971: VecAXPY_SeqCUDA(yin,alpha,xin);
972: } else if (a == (PetscScalar)1.0) {
973: VecAYPX_SeqCUDA(yin,beta,xin);
974: } else if (b == (PetscScalar)0.0) {
975: VecCUDAGetArrayRead(xin,&xarray);
976: VecCUDAGetArray(yin,&yarray);
977: PetscLogGpuTimeBegin();
978: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
979: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
980: WaitForGPU();CHKERRCUDA(ierr);
981: PetscLogGpuTimeEnd();
982: PetscLogGpuFlops(xin->map->n);
983: VecCUDARestoreArrayRead(xin,&xarray);
984: VecCUDARestoreArray(yin,&yarray);
985: } else {
986: VecCUDAGetArrayRead(xin,&xarray);
987: VecCUDAGetArray(yin,&yarray);
988: PetscLogGpuTimeBegin();
989: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
990: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
991: VecCUDARestoreArrayRead(xin,&xarray);
992: VecCUDARestoreArray(yin,&yarray);
993: WaitForGPU();CHKERRCUDA(ierr);
994: PetscLogGpuTimeEnd();
995: PetscLogGpuFlops(3.0*xin->map->n);
996: }
997: return(0);
998: }
1000: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1001: {
1003: PetscInt n = zin->map->n;
1006: if (gamma == (PetscScalar)1.0) {
1007: /* z = a*x + b*y + z */
1008: VecAXPY_SeqCUDA(zin,alpha,xin);
1009: VecAXPY_SeqCUDA(zin,beta,yin);
1010: PetscLogGpuFlops(4.0*n);
1011: } else {
1012: /* z = a*x + b*y + c*z */
1013: VecScale_SeqCUDA(zin,gamma);
1014: VecAXPY_SeqCUDA(zin,alpha,xin);
1015: VecAXPY_SeqCUDA(zin,beta,yin);
1016: PetscLogGpuFlops(5.0*n);
1017: }
1018: WaitForGPU();CHKERRCUDA(ierr);
1019: return(0);
1020: }
1022: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1023: {
1024: PetscInt n = win->map->n;
1025: const PetscScalar *xarray,*yarray;
1026: PetscScalar *warray;
1027: thrust::device_ptr<const PetscScalar> xptr,yptr;
1028: thrust::device_ptr<PetscScalar> wptr;
1029: PetscErrorCode ierr;
1032: VecCUDAGetArray(win,&warray);
1033: VecCUDAGetArrayRead(xin,&xarray);
1034: VecCUDAGetArrayRead(yin,&yarray);
1035: PetscLogGpuTimeBegin();
1036: try {
1037: wptr = thrust::device_pointer_cast(warray);
1038: xptr = thrust::device_pointer_cast(xarray);
1039: yptr = thrust::device_pointer_cast(yarray);
1040: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1041: WaitForGPU();CHKERRCUDA(ierr);
1042: } catch (char *ex) {
1043: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1044: }
1045: PetscLogGpuTimeEnd();
1046: VecCUDARestoreArrayRead(xin,&xarray);
1047: VecCUDARestoreArrayRead(yin,&yarray);
1048: VecCUDARestoreArray(win,&warray);
1049: PetscLogGpuFlops(n);
1050: return(0);
1051: }
1053: /* should do infinity norm in cuda */
1055: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1056: {
1057: PetscErrorCode ierr;
1058: PetscInt n = xin->map->n;
1059: PetscBLASInt one = 1, bn;
1060: const PetscScalar *xarray;
1061: cublasHandle_t cublasv2handle;
1062: cublasStatus_t cberr;
1063: cudaError_t err;
1066: PetscCUBLASGetHandle(&cublasv2handle);
1067: PetscBLASIntCast(n,&bn);
1068: if (type == NORM_2 || type == NORM_FROBENIUS) {
1069: VecCUDAGetArrayRead(xin,&xarray);
1070: PetscLogGpuTimeBegin();
1071: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1072: WaitForGPU();CHKERRCUDA(ierr);
1073: PetscLogGpuTimeEnd();
1074: VecCUDARestoreArrayRead(xin,&xarray);
1075: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1076: } else if (type == NORM_INFINITY) {
1077: int i;
1078: VecCUDAGetArrayRead(xin,&xarray);
1079: PetscLogGpuTimeBegin();
1080: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1081: PetscLogGpuTimeEnd();
1082: if (bn) {
1083: PetscScalar zs;
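/* cublasI*amax() returns a 1-based (Fortran-style) index, hence the i-1 offset below */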
1084: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1085: *z = PetscAbsScalar(zs);
1086: } else *z = 0.0;
1087: VecCUDARestoreArrayRead(xin,&xarray);
1088: } else if (type == NORM_1) {
1089: VecCUDAGetArrayRead(xin,&xarray);
1090: PetscLogGpuTimeBegin();
1091: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1092: VecCUDARestoreArrayRead(xin,&xarray);
1093: WaitForGPU();CHKERRCUDA(ierr);
1094: PetscLogGpuTimeEnd();
1095: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1096: } else if (type == NORM_1_AND_2) {
1097: VecNorm_SeqCUDA(xin,NORM_1,z);
1098: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1099: }
1100: return(0);
1101: }
1103: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1104: {
1105: PetscErrorCode ierr;
1106: PetscReal n=s->map->n;
1107: const PetscScalar *sarray,*tarray;
1110: VecCUDAGetArrayRead(s,&sarray);
1111: VecCUDAGetArrayRead(t,&tarray);
1112: VecDot_SeqCUDA(s,t,dp);
1113: VecDot_SeqCUDA(t,t,nm);
1114: VecCUDARestoreArrayRead(s,&sarray);
1115: VecCUDARestoreArrayRead(t,&tarray);
1116: WaitForGPU();CHKERRCUDA(ierr);
1117: PetscLogGpuFlops(4.0*n);
1118: return(0);
1119: }
1121: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1122: {
1124: cudaError_t err;
1127: if (v->spptr) {
1128: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1129: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1130: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1131: }
1132: if (((Vec_CUDA*)v->spptr)->stream) {
1133: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1134: }
1135: PetscFree(v->spptr);
1136: }
1137: VecDestroy_SeqCUDA_Private(v);
1138: return(0);
1139: }
1141: #if defined(PETSC_USE_COMPLEX)
1142: struct conjugate
1143: {
1144: __host__ __device__
1145: PetscScalar operator()(PetscScalar x)
1146: {
1147: return PetscConj(x);
1148: }
1149: };
1150: #endif
1152: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1153: {
1154: PetscScalar *xarray;
1155: PetscErrorCode ierr;
1156: #if defined(PETSC_USE_COMPLEX)
1157: PetscInt n = xin->map->n;
1158: thrust::device_ptr<PetscScalar> xptr;
1159: #endif
1162: VecCUDAGetArray(xin,&xarray);
1163: #if defined(PETSC_USE_COMPLEX)
1164: PetscLogGpuTimeBegin();
1165: try {
1166: xptr = thrust::device_pointer_cast(xarray);
1167: thrust::transform(xptr,xptr+n,xptr,conjugate());
1168: WaitForGPU();CHKERRCUDA(ierr);
1169: } catch (char *ex) {
1170: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1171: }
1172: PetscLogGpuTimeEnd();
1173: #endif
1174: VecCUDARestoreArray(xin,&xarray);
1175: return(0);
1176: }
1178: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1179: {
1181: cudaError_t err;
1188: if (w->data) {
1189: if (((Vec_Seq*)w->data)->array_allocated) {
1190: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1191: }
1192: ((Vec_Seq*)w->data)->array = NULL;
1193: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1194: }
1195: if (w->spptr) {
1196: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1197: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1198: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1199: }
1200: if (((Vec_CUDA*)v->spptr)->stream) {
1201: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1202: }
1203: PetscFree(w->spptr);
1204: }
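/* For PETSc-native vectors, w below simply aliases v's host data and GPU structure (no copy is
   made); VecRestoreLocalVector_SeqCUDA() hands the pointers back. Non-native vectors fall back
   to VecGetArray(). */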
1206: if (v->petscnative) {
1207: PetscFree(w->data);
1208: w->data = v->data;
1209: w->offloadmask = v->offloadmask;
1210: w->spptr = v->spptr;
1211: PetscObjectStateIncrease((PetscObject)w);
1212: } else {
1213: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1214: w->offloadmask = PETSC_OFFLOAD_CPU;
1215: VecCUDAAllocateCheck(w);
1216: }
1217: return(0);
1218: }
1220: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1221: {
1223: cudaError_t err;
1230: if (v->petscnative) {
1231: v->data = w->data;
1232: v->offloadmask = w->offloadmask;
1233: v->spptr = w->spptr;
1234: VecCUDACopyFromGPU(v);
1235: PetscObjectStateIncrease((PetscObject)v);
1236: w->data = 0;
1237: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1238: w->spptr = 0;
1239: } else {
1240: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1241: if ((Vec_CUDA*)w->spptr) {
1242: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1243: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1244: if (((Vec_CUDA*)v->spptr)->stream) {
1245: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1246: }
1247: PetscFree(w->spptr);
1248: }
1249: }
1250: return(0);
1251: }