Actual source code: veccuda2.cu
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/functional.h>
 15: #include <thrust/iterator/counting_iterator.h>
 16: #include <thrust/reduce.h>
 17: #include <thrust/transform.h>
 18: #if defined(PETSC_USE_COMPLEX)
 19: #include <thrust/transform_reduce.h>
 20: #endif

 22: #if THRUST_VERSION >= 101600 && !PetscDefined(USE_DEBUG)
 23: static thrust::cuda_cub::par_nosync_t::stream_attachment_type VecCUDAThrustPolicy(Vec x) {
 24:     return thrust::cuda::par_nosync.on(((Vec_CUDA*)x->spptr)->stream);
 25: }
 26: #else
 27: static thrust::cuda_cub::par_t::stream_attachment_type VecCUDAThrustPolicy(Vec x) {
 28:     return thrust::cuda::par.on(((Vec_CUDA*)x->spptr)->stream);
 29: }
 30: #endif

 32: /*
 33:     Allocates space for the vector array on the GPU if it does not exist.
 34:     Does NOT change the PetscCUDAFlag for the vector
 35:     Does NOT zero the CUDA array

 37:  */
 38: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 39: {
 41:   Vec_CUDA       *veccuda;
 42:   PetscBool      option_set;

 44:   if (!v->spptr) {
 45:     PetscReal pinned_memory_min;
 46:     PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
 47:     veccuda = (Vec_CUDA*)v->spptr;
 48:     cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));
 49:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 50:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 51:       if (v->data && ((Vec_Seq*)v->data)->array) {
 52:         v->offloadmask = PETSC_OFFLOAD_CPU;
 53:       } else {
 54:         v->offloadmask = PETSC_OFFLOAD_GPU;
 55:       }
 56:     }
 57:     pinned_memory_min = 0;

 59:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 60:        Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 61:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 62:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 63:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 64:     PetscOptionsEnd();
 65:   }
 66:   return 0;
 67: }

 69: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 70: PetscErrorCode VecCUDACopyToGPU(Vec v)
 71: {
 72:   Vec_CUDA       *veccuda;
 73:   PetscScalar    *varray;

 76:   VecCUDAAllocateCheck(v);
 77:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 78:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 79:     veccuda        = (Vec_CUDA*)v->spptr;
 80:     varray         = veccuda->GPUarray;
 81:     cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);
 82:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 83:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 84:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 85:   }
 86:   return 0;
 87: }

 89: /*
 90:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 91: */
 92: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 93: {
 94:   Vec_CUDA       *veccuda;
 95:   PetscScalar    *varray;

 98:   VecCUDAAllocateCheckHost(v);
 99:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
100:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
101:     veccuda        = (Vec_CUDA*)v->spptr;
102:     varray         = veccuda->GPUarray;
103:     cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
104:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
105:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
106:     v->offloadmask = PETSC_OFFLOAD_BOTH;
107:   }
108:   return 0;
109: }

111: /*MC
112:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

114:    Options Database Keys:
115: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

117:   Level: beginner

119: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
120: M*/

122: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
123: {
124:   const PetscScalar *xarray;
125:   PetscScalar       *yarray;
126:   PetscBLASInt      one = 1,bn = 0;
127:   PetscScalar       sone = 1.0;
128:   cublasHandle_t    cublasv2handle;

130:   PetscCUBLASGetHandle(&cublasv2handle);
131:   PetscBLASIntCast(yin->map->n,&bn);
132:   VecCUDAGetArrayRead(xin,&xarray);
133:   VecCUDAGetArray(yin,&yarray);
134:   PetscLogGpuTimeBegin();
135:   if (alpha == (PetscScalar)0.0) {
136:     cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
137:   } else if (alpha == (PetscScalar)1.0) {
138:     cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
139:     PetscLogGpuFlops(1.0*yin->map->n);
140:   } else {
141:     cublasXscal(cublasv2handle,bn,&alpha,yarray,one);
142:     cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);
143:     PetscLogGpuFlops(2.0*yin->map->n);
144:   }
145:   PetscLogGpuTimeEnd();
146:   VecCUDARestoreArrayRead(xin,&xarray);
147:   VecCUDARestoreArray(yin,&yarray);
148:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
149:   return 0;
150: }

152: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
153: {
154:   const PetscScalar *xarray;
155:   PetscScalar       *yarray;
156:   PetscBLASInt      one = 1,bn = 0;
157:   cublasHandle_t    cublasv2handle;
158:   PetscBool         xiscuda;

160:   if (alpha == (PetscScalar)0.0) return 0;
161:   PetscCUBLASGetHandle(&cublasv2handle);
162:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
163:   if (xiscuda) {
164:     PetscBLASIntCast(yin->map->n,&bn);
165:     VecCUDAGetArrayRead(xin,&xarray);
166:     VecCUDAGetArray(yin,&yarray);
167:     PetscLogGpuTimeBegin();
168:     cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
169:     PetscLogGpuTimeEnd();
170:     VecCUDARestoreArrayRead(xin,&xarray);
171:     VecCUDARestoreArray(yin,&yarray);
172:     PetscLogGpuFlops(2.0*yin->map->n);
173:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
174:   } else {
175:     VecAXPY_Seq(yin,alpha,xin);
176:   }
177:   return 0;
178: }

180: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
181: {
182:   PetscInt                              n = xin->map->n;
183:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
184:   PetscScalar                           *warray=NULL;
185:   thrust::device_ptr<const PetscScalar> xptr,yptr;
186:   thrust::device_ptr<PetscScalar>       wptr;

188:   if (xin->boundtocpu || yin->boundtocpu) {
189:     VecPointwiseDivide_Seq(win,xin,yin);
190:     return 0;
191:   }
192:   VecCUDAGetArrayWrite(win,&warray);
193:   VecCUDAGetArrayRead(xin,&xarray);
194:   VecCUDAGetArrayRead(yin,&yarray);
195:   PetscLogGpuTimeBegin();
196:   try {
197:     wptr = thrust::device_pointer_cast(warray);
198:     xptr = thrust::device_pointer_cast(xarray);
199:     yptr = thrust::device_pointer_cast(yarray);
200:     thrust::transform(VecCUDAThrustPolicy(win),xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
201:   } catch (char *ex) {
202:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
203:   }
204:   PetscLogGpuTimeEnd();
205:   PetscLogGpuFlops(n);
206:   VecCUDARestoreArrayRead(xin,&xarray);
207:   VecCUDARestoreArrayRead(yin,&yarray);
208:   VecCUDARestoreArrayWrite(win,&warray);
209:   return 0;
210: }

212: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
213: {
214:   const PetscScalar *xarray=NULL,*yarray=NULL;
215:   PetscScalar       *warray=NULL;
216:   PetscBLASInt      one = 1,bn = 0;
217:   cublasHandle_t    cublasv2handle;
218:   cudaStream_t      stream;

220:   PetscCUBLASGetHandle(&cublasv2handle);
221:   PetscBLASIntCast(win->map->n,&bn);
222:   if (alpha == (PetscScalar)0.0) {
223:     VecCopy_SeqCUDA(yin,win);
224:   } else {
225:     VecCUDAGetArrayRead(xin,&xarray);
226:     VecCUDAGetArrayRead(yin,&yarray);
227:     VecCUDAGetArrayWrite(win,&warray);
228:     PetscLogGpuTimeBegin();
229:     cublasGetStream(cublasv2handle,&stream);
230:     cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);
231:     cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);
232:     PetscLogGpuTimeEnd();
233:     PetscLogGpuFlops(2*win->map->n);
234:     VecCUDARestoreArrayRead(xin,&xarray);
235:     VecCUDARestoreArrayRead(yin,&yarray);
236:     VecCUDARestoreArrayWrite(win,&warray);
237:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
238:   }
239:   return 0;
240: }

242: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
243: {
244:   PetscInt          n = xin->map->n,j;
245:   PetscScalar       *xarray;
246:   const PetscScalar *yarray;
247:   PetscBLASInt      one = 1,bn = 0;
248:   cublasHandle_t    cublasv2handle;

250:   PetscLogGpuFlops(nv*2.0*n);
251:   PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
252:   PetscCUBLASGetHandle(&cublasv2handle);
253:   PetscBLASIntCast(n,&bn);
254:   VecCUDAGetArray(xin,&xarray);
255:   PetscLogGpuTimeBegin();
256:   for (j=0; j<nv; j++) {
257:     VecCUDAGetArrayRead(y[j],&yarray);
258:     cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);
259:     VecCUDARestoreArrayRead(y[j],&yarray);
260:   }
261:   PetscLogGpuTimeEnd();
262:   VecCUDARestoreArray(xin,&xarray);
263:   return 0;
264: }

266: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
267: {
268:   const PetscScalar *xarray,*yarray;
269:   PetscBLASInt      one = 1,bn = 0;
270:   cublasHandle_t    cublasv2handle;

272:   PetscCUBLASGetHandle(&cublasv2handle);
273:   PetscBLASIntCast(yin->map->n,&bn);
274:   VecCUDAGetArrayRead(xin,&xarray);
275:   VecCUDAGetArrayRead(yin,&yarray);
276:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
277:   PetscLogGpuTimeBegin();
278:   cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);
279:   PetscLogGpuTimeEnd();
280:   if (xin->map->n >0) {
281:     PetscLogGpuFlops(2.0*xin->map->n-1);
282:   }
283:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
284:   VecCUDARestoreArrayRead(xin,&xarray);
285:   VecCUDARestoreArrayRead(yin,&yarray);
286:   return 0;
287: }

289: //
290: // CUDA kernels for MDot to follow
291: //

293: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
294: #define MDOT_WORKGROUP_SIZE 128
295: #define MDOT_WORKGROUP_NUM  128

297: #if !defined(PETSC_USE_COMPLEX)
298: // M = 2:
299: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
300:                                         PetscInt size, PetscScalar *group_results)
301: {
302:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
303:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
304:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
305:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
306:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

308:   PetscScalar entry_x    = 0;
309:   PetscScalar group_sum0 = 0;
310:   PetscScalar group_sum1 = 0;
311:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
312:     entry_x     = x[i];   // load only once from global memory!
313:     group_sum0 += entry_x * y0[i];
314:     group_sum1 += entry_x * y1[i];
315:   }
316:   tmp_buffer[threadIdx.x]                       = group_sum0;
317:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

319:   // parallel reduction
320:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
321:     __syncthreads();
322:     if (threadIdx.x < stride) {
323:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
324:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
325:     }
326:   }

328:   // write result of group to group_results
329:   if (threadIdx.x == 0) {
330:     group_results[blockIdx.x]             = tmp_buffer[0];
331:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
332:   }
333: }

335: // M = 3:
336: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
337:                                         PetscInt size, PetscScalar *group_results)
338: {
339:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
340:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
341:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
342:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
343:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

345:   PetscScalar entry_x    = 0;
346:   PetscScalar group_sum0 = 0;
347:   PetscScalar group_sum1 = 0;
348:   PetscScalar group_sum2 = 0;
349:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
350:     entry_x     = x[i];   // load only once from global memory!
351:     group_sum0 += entry_x * y0[i];
352:     group_sum1 += entry_x * y1[i];
353:     group_sum2 += entry_x * y2[i];
354:   }
355:   tmp_buffer[threadIdx.x]                           = group_sum0;
356:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
357:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

359:   // parallel reduction
360:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
361:     __syncthreads();
362:     if (threadIdx.x < stride) {
363:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
364:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
365:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
366:     }
367:   }

369:   // write result of group to group_results
370:   if (threadIdx.x == 0) {
371:     group_results[blockIdx.x                ] = tmp_buffer[0];
372:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
373:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
374:   }
375: }

377: // M = 4:
378: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
379:                                         PetscInt size, PetscScalar *group_results)
380: {
381:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
382:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
383:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
384:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
385:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

387:   PetscScalar entry_x    = 0;
388:   PetscScalar group_sum0 = 0;
389:   PetscScalar group_sum1 = 0;
390:   PetscScalar group_sum2 = 0;
391:   PetscScalar group_sum3 = 0;
392:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
393:     entry_x     = x[i];   // load only once from global memory!
394:     group_sum0 += entry_x * y0[i];
395:     group_sum1 += entry_x * y1[i];
396:     group_sum2 += entry_x * y2[i];
397:     group_sum3 += entry_x * y3[i];
398:   }
399:   tmp_buffer[threadIdx.x]                           = group_sum0;
400:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
401:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
402:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

404:   // parallel reduction
405:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
406:     __syncthreads();
407:     if (threadIdx.x < stride) {
408:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
409:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
410:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
411:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
412:     }
413:   }

415:   // write result of group to group_results
416:   if (threadIdx.x == 0) {
417:     group_results[blockIdx.x                ] = tmp_buffer[0];
418:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
419:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
420:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
421:   }
422: }

424: // M = 8:
425: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
426:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
427:                                           PetscInt size, PetscScalar *group_results)
428: {
429:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
430:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
431:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
432:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
433:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

435:   PetscScalar entry_x    = 0;
436:   PetscScalar group_sum0 = 0;
437:   PetscScalar group_sum1 = 0;
438:   PetscScalar group_sum2 = 0;
439:   PetscScalar group_sum3 = 0;
440:   PetscScalar group_sum4 = 0;
441:   PetscScalar group_sum5 = 0;
442:   PetscScalar group_sum6 = 0;
443:   PetscScalar group_sum7 = 0;
444:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
445:     entry_x     = x[i];   // load only once from global memory!
446:     group_sum0 += entry_x * y0[i];
447:     group_sum1 += entry_x * y1[i];
448:     group_sum2 += entry_x * y2[i];
449:     group_sum3 += entry_x * y3[i];
450:     group_sum4 += entry_x * y4[i];
451:     group_sum5 += entry_x * y5[i];
452:     group_sum6 += entry_x * y6[i];
453:     group_sum7 += entry_x * y7[i];
454:   }
455:   tmp_buffer[threadIdx.x]                           = group_sum0;
456:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
457:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
458:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
459:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
460:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
461:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
462:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

464:   // parallel reduction
465:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
466:     __syncthreads();
467:     if (threadIdx.x < stride) {
468:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
469:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
470:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
471:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
472:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
473:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
474:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
475:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
476:     }
477:   }

479:   // write result of group to group_results
480:   if (threadIdx.x == 0) {
481:     group_results[blockIdx.x                ] = tmp_buffer[0];
482:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
483:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
484:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
485:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
486:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
487:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
488:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
489:   }
490: }
491: #endif /* !defined(PETSC_USE_COMPLEX) */

493: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
494: {
495:   PetscInt          i,n = xin->map->n,current_y_index = 0;
496:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
497: #if !defined(PETSC_USE_COMPLEX)
498:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
499:   PetscScalar       *group_results_gpu,*group_results_cpu;
500: #endif
501:   PetscBLASInt      one = 1,bn = 0;
502:   cublasHandle_t    cublasv2handle;

504:   PetscCUBLASGetHandle(&cublasv2handle);
505:   PetscBLASIntCast(xin->map->n,&bn);
507:   /* Handle the case of local size zero first */
508:   if (!xin->map->n) {
509:     for (i=0; i<nv; ++i) z[i] = 0;
510:     return 0;
511:   }

513: #if !defined(PETSC_USE_COMPLEX)
514:   // allocate scratchpad memory for the results of individual work groups:
515:   cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
516:   PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
517: #endif
518:   VecCUDAGetArrayRead(xin,&xptr);
519:   PetscLogGpuTimeBegin();

521:   while (current_y_index < nv)
522:   {
523:     switch (nv - current_y_index) {

525:       case 7:
526:       case 6:
527:       case 5:
528:       case 4:
529:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
530:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
531:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
532:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
533: #if defined(PETSC_USE_COMPLEX)
534:         cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
535:         cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
536:         cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
537:         cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
538: #else
539:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
540: #endif
541:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
542:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
543:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
544:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
545:         current_y_index += 4;
546:         break;

548:       case 3:
549:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
550:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
551:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

553: #if defined(PETSC_USE_COMPLEX)
554:         cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
555:         cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
556:         cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
557: #else
558:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
559: #endif
560:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
561:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
562:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
563:         current_y_index += 3;
564:         break;

566:       case 2:
567:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
568:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
569: #if defined(PETSC_USE_COMPLEX)
570:         cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
571:         cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
572: #else
573:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
574: #endif
575:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
576:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
577:         current_y_index += 2;
578:         break;

580:       case 1:
581:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
582:         cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
583:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
584:         current_y_index += 1;
585:         break;

587:       default: // 8 or more vectors left
588:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
589:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
590:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
591:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
592:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
593:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
594:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
595:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
596: #if defined(PETSC_USE_COMPLEX)
597:         cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
598:         cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
599:         cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
600:         cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
601:         cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);
602:         cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);
603:         cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);
604:         cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);
605: #else
606:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
607: #endif
608:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
609:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
610:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
611:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
612:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
613:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
614:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
615:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
616:         current_y_index += 8;
617:         break;
618:     }
619:   }
620:   PetscLogGpuTimeEnd();
621:   VecCUDARestoreArrayRead(xin,&xptr);

623: #if defined(PETSC_USE_COMPLEX)
624:   PetscLogGpuToCpuScalar(nv*sizeof(PetscScalar));
625: #else
626:   // copy results to CPU
627:   cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);

629:   // sum group results into z
630:   for (j=0; j<nv1; ++j) {
631:     z[j] = 0;
632:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
633:   }
634:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
635:   cudaFree(group_results_gpu);
636:   PetscFree(group_results_cpu);
637:   PetscLogGpuToCpuScalar(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
638: #endif
639:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
640:   return 0;
641: }

643: #undef MDOT_WORKGROUP_SIZE
644: #undef MDOT_WORKGROUP_NUM

646: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
647: {
648:   PetscInt                        n = xin->map->n;
649:   PetscScalar                     *xarray = NULL;
650:   thrust::device_ptr<PetscScalar> xptr;

652:   VecCUDAGetArrayWrite(xin,&xarray);
653:   PetscLogGpuTimeBegin();
654:   if (alpha == (PetscScalar)0.0) {
655:     cudaMemset(xarray,0,n*sizeof(PetscScalar));
656:   } else {
657:     try {
658:       xptr = thrust::device_pointer_cast(xarray);
659:       thrust::fill(xptr,xptr+n,alpha);
660:     } catch (char *ex) {
661:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
662:     }
663:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
664:   }
665:   PetscLogGpuTimeEnd();
666:   VecCUDARestoreArrayWrite(xin,&xarray);
667:   return 0;
668: }

670: struct PetscScalarReciprocal
671: {
672:   __host__ __device__
673:   PetscScalar operator()(const PetscScalar& s)
674:   {
675:     return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
676:   }
677: };

679: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
680: {
681:   PetscInt       n;
682:   PetscScalar    *x;

684:   VecGetLocalSize(v,&n);
685:   VecCUDAGetArray(v,&x);
686:   PetscLogGpuTimeBegin();
687:   try {
688:     auto xptr = thrust::device_pointer_cast(x);
689:     thrust::transform(VecCUDAThrustPolicy(v),xptr,xptr+n,xptr,PetscScalarReciprocal());
690:   } catch (char *ex) {
691:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
692:   }
693:   PetscLogGpuTimeEnd();
694:   VecCUDARestoreArray(v,&x);
695:   return 0;
696: }

698: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
699: {
700:   PetscScalar    *xarray;
701:   PetscBLASInt   one = 1,bn = 0;
702:   cublasHandle_t cublasv2handle;

704:   if (alpha == (PetscScalar)0.0) {
705:     VecSet_SeqCUDA(xin,alpha);
706:   } else if (alpha != (PetscScalar)1.0) {
707:     PetscCUBLASGetHandle(&cublasv2handle);
708:     PetscBLASIntCast(xin->map->n,&bn);
709:     VecCUDAGetArray(xin,&xarray);
710:     PetscLogGpuTimeBegin();
711:     cublasXscal(cublasv2handle,bn,&alpha,xarray,one);
712:     VecCUDARestoreArray(xin,&xarray);
713:     PetscLogGpuTimeEnd();
714:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
715:     PetscLogGpuFlops(xin->map->n);
716:   }
717:   return 0;
718: }

720: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
721: {
722:   const PetscScalar *xarray,*yarray;
723:   PetscBLASInt      one = 1,bn = 0;
724:   cublasHandle_t    cublasv2handle;

726:   PetscCUBLASGetHandle(&cublasv2handle);
727:   PetscBLASIntCast(xin->map->n,&bn);
728:   VecCUDAGetArrayRead(xin,&xarray);
729:   VecCUDAGetArrayRead(yin,&yarray);
730:   PetscLogGpuTimeBegin();
731:   cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);
732:   PetscLogGpuTimeEnd();
733:   if (xin->map->n > 0) {
734:     PetscLogGpuFlops(2.0*xin->map->n-1);
735:   }
736:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
737:   VecCUDARestoreArrayRead(yin,&yarray);
738:   VecCUDARestoreArrayRead(xin,&xarray);
739:   return 0;
740: }

742: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
743: {
744:   const PetscScalar *xarray;
745:   PetscScalar       *yarray;

747:   if (xin != yin) {
748:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
749:       PetscBool yiscuda;

751:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
752:       VecCUDAGetArrayRead(xin,&xarray);
753:       if (yiscuda) {
754:         VecCUDAGetArrayWrite(yin,&yarray);
755:       } else {
756:         VecGetArrayWrite(yin,&yarray);
757:       }
758:       PetscLogGpuTimeBegin();
759:       if (yiscuda) {
760:         cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
761:       } else {
762:         cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
763:         PetscLogGpuToCpu((yin->map->n)*sizeof(PetscScalar));
764:       }
765:       PetscLogGpuTimeEnd();
766:       VecCUDARestoreArrayRead(xin,&xarray);
767:       if (yiscuda) {
768:         VecCUDARestoreArrayWrite(yin,&yarray);
769:       } else {
770:         VecRestoreArrayWrite(yin,&yarray);
771:       }
772:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
773:       /* copy in CPU if we are on the CPU */
774:       VecCopy_SeqCUDA_Private(xin,yin);
775:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
776:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
777:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
778:         /* copy in CPU */
779:         VecCopy_SeqCUDA_Private(xin,yin);
780:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
781:         /* copy in GPU */
782:         VecCUDAGetArrayRead(xin,&xarray);
783:         VecCUDAGetArrayWrite(yin,&yarray);
784:         PetscLogGpuTimeBegin();
785:         cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
786:         PetscLogGpuTimeEnd();
787:         VecCUDARestoreArrayRead(xin,&xarray);
788:         VecCUDARestoreArrayWrite(yin,&yarray);
789:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
790:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
791:            default to copy in GPU (this is an arbitrary choice) */
792:         VecCUDAGetArrayRead(xin,&xarray);
793:         VecCUDAGetArrayWrite(yin,&yarray);
794:         PetscLogGpuTimeBegin();
795:         cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);
796:         PetscLogGpuTimeEnd();
797:         VecCUDARestoreArrayRead(xin,&xarray);
798:         VecCUDARestoreArrayWrite(yin,&yarray);
799:       } else {
800:         VecCopy_SeqCUDA_Private(xin,yin);
801:       }
802:     }
803:   }
804:   return 0;
805: }

807: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
808: {
809:   PetscBLASInt   one = 1,bn = 0;
810:   PetscScalar    *xarray,*yarray;
811:   cublasHandle_t cublasv2handle;

813:   PetscCUBLASGetHandle(&cublasv2handle);
814:   PetscBLASIntCast(xin->map->n,&bn);
815:   if (xin != yin) {
816:     VecCUDAGetArray(xin,&xarray);
817:     VecCUDAGetArray(yin,&yarray);
818:     PetscLogGpuTimeBegin();
819:     cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);
820:     PetscLogGpuTimeEnd();
821:     VecCUDARestoreArray(xin,&xarray);
822:     VecCUDARestoreArray(yin,&yarray);
823:   }
824:   return 0;
825: }

827: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
828: {
829:   PetscScalar       a = alpha,b = beta;
830:   const PetscScalar *xarray;
831:   PetscScalar       *yarray;
832:   PetscBLASInt      one = 1, bn = 0;
833:   cublasHandle_t    cublasv2handle;

835:   PetscCUBLASGetHandle(&cublasv2handle);
836:   PetscBLASIntCast(yin->map->n,&bn);
837:   if (a == (PetscScalar)0.0) {
838:     VecScale_SeqCUDA(yin,beta);
839:   } else if (b == (PetscScalar)1.0) {
840:     VecAXPY_SeqCUDA(yin,alpha,xin);
841:   } else if (a == (PetscScalar)1.0) {
842:     VecAYPX_SeqCUDA(yin,beta,xin);
843:   } else if (b == (PetscScalar)0.0) {
844:     VecCUDAGetArrayRead(xin,&xarray);
845:     VecCUDAGetArray(yin,&yarray);
846:     PetscLogGpuTimeBegin();
847:     cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
848:     cublasXscal(cublasv2handle,bn,&alpha,yarray,one);
849:     PetscLogGpuTimeEnd();
850:     PetscLogGpuFlops(xin->map->n);
851:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
852:     VecCUDARestoreArrayRead(xin,&xarray);
853:     VecCUDARestoreArray(yin,&yarray);
854:   } else {
855:     VecCUDAGetArrayRead(xin,&xarray);
856:     VecCUDAGetArray(yin,&yarray);
857:     PetscLogGpuTimeBegin();
858:     cublasXscal(cublasv2handle,bn,&beta,yarray,one);
859:     cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);
860:     VecCUDARestoreArrayRead(xin,&xarray);
861:     VecCUDARestoreArray(yin,&yarray);
862:     PetscLogGpuTimeEnd();
863:     PetscLogGpuFlops(3.0*xin->map->n);
864:     PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
865:   }
866:   return 0;
867: }

869: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
870: {
871:   PetscInt       n = zin->map->n;

873:   if (gamma == (PetscScalar)1.0) {
874:     /* z = ax + b*y + z */
875:     VecAXPY_SeqCUDA(zin,alpha,xin);
876:     VecAXPY_SeqCUDA(zin,beta,yin);
877:     PetscLogGpuFlops(4.0*n);
878:   } else {
879:     /* z = a*x + b*y + c*z */
880:     VecScale_SeqCUDA(zin,gamma);
881:     VecAXPY_SeqCUDA(zin,alpha,xin);
882:     VecAXPY_SeqCUDA(zin,beta,yin);
883:     PetscLogGpuFlops(5.0*n);
884:   }
885:   return 0;
886: }

888: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
889: {
890:   PetscInt                              n = win->map->n;
891:   const PetscScalar                     *xarray,*yarray;
892:   PetscScalar                           *warray;
893:   thrust::device_ptr<const PetscScalar> xptr,yptr;
894:   thrust::device_ptr<PetscScalar>       wptr;

896:   if (xin->boundtocpu || yin->boundtocpu) {
897:     VecPointwiseMult_Seq(win,xin,yin);
898:     return 0;
899:   }
900:   VecCUDAGetArrayRead(xin,&xarray);
901:   VecCUDAGetArrayRead(yin,&yarray);
902:   VecCUDAGetArrayWrite(win,&warray);
903:   PetscLogGpuTimeBegin();
904:   try {
905:     wptr = thrust::device_pointer_cast(warray);
906:     xptr = thrust::device_pointer_cast(xarray);
907:     yptr = thrust::device_pointer_cast(yarray);
908:     thrust::transform(VecCUDAThrustPolicy(win),xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
909:   } catch (char *ex) {
910:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
911:   }
912:   PetscLogGpuTimeEnd();
913:   VecCUDARestoreArrayRead(xin,&xarray);
914:   VecCUDARestoreArrayRead(yin,&yarray);
915:   VecCUDARestoreArrayWrite(win,&warray);
916:   PetscLogGpuFlops(n);
917:   return 0;
918: }

920: /* should do infinity norm in cuda */

922: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
923: {
924:   PetscInt          n = xin->map->n;
925:   PetscBLASInt      one = 1, bn = 0;
926:   const PetscScalar *xarray;
927:   cublasHandle_t    cublasv2handle;

929:   PetscCUBLASGetHandle(&cublasv2handle);
930:   PetscBLASIntCast(n,&bn);
931:   if (type == NORM_2 || type == NORM_FROBENIUS) {
932:     VecCUDAGetArrayRead(xin,&xarray);
933:     PetscLogGpuTimeBegin();
934:     cublasXnrm2(cublasv2handle,bn,xarray,one,z);
935:     PetscLogGpuTimeEnd();
936:     VecCUDARestoreArrayRead(xin,&xarray);
937:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
938:   } else if (type == NORM_INFINITY) {
939:     int  i;
940:     VecCUDAGetArrayRead(xin,&xarray);
941:     PetscLogGpuTimeBegin();
942:     cublasIXamax(cublasv2handle,bn,xarray,one,&i);
943:     PetscLogGpuTimeEnd();
944:     if (bn) {
945:       PetscScalar zs;
946:       cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);
947:       *z = PetscAbsScalar(zs);
948:     } else *z = 0.0;
949:     VecCUDARestoreArrayRead(xin,&xarray);
950:   } else if (type == NORM_1) {
951:     VecCUDAGetArrayRead(xin,&xarray);
952:     PetscLogGpuTimeBegin();
953:     cublasXasum(cublasv2handle,bn,xarray,one,z);
954:     VecCUDARestoreArrayRead(xin,&xarray);
955:     PetscLogGpuTimeEnd();
956:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
957:   } else if (type == NORM_1_AND_2) {
958:     VecNorm_SeqCUDA(xin,NORM_1,z);
959:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
960:   }
961:   PetscLogGpuToCpuScalar(sizeof(PetscReal));
962:   return 0;
963: }

965: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
966: {
967:   VecDot_SeqCUDA(s,t,dp);
968:   VecDot_SeqCUDA(t,t,nm);
969:   return 0;
970: }

972: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
973: {
974:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;

976:   if (v->spptr) {
977:     if (veccuda->GPUarray_allocated) {
978:      #if defined(PETSC_HAVE_NVSHMEM)
979:       if (veccuda->nvshmem) {
980:         PetscNvshmemFree(veccuda->GPUarray_allocated);
981:         veccuda->nvshmem = PETSC_FALSE;
982:       }
983:       else
984:      #endif
985:       cudaFree(veccuda->GPUarray_allocated);
986:       veccuda->GPUarray_allocated = NULL;
987:     }
988:     if (veccuda->stream) {
989:       cudaStreamDestroy(veccuda->stream);
990:     }
991:   }
992:   VecDestroy_SeqCUDA_Private(v);
993:   PetscFree(v->spptr);
994:   return 0;
995: }

997: #if defined(PETSC_USE_COMPLEX)
998: struct conjugate
999: {
1000:   __host__ __device__
1001:     PetscScalar operator()(const PetscScalar& x)
1002:     {
1003:       return PetscConj(x);
1004:     }
1005: };
1006: #endif

1008: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1009: {
1010: #if defined(PETSC_USE_COMPLEX)
1011:   PetscScalar                     *xarray;
1012:   PetscInt                        n = xin->map->n;
1013:   thrust::device_ptr<PetscScalar> xptr;

1015:   VecCUDAGetArray(xin,&xarray);
1016:   PetscLogGpuTimeBegin();
1017:   try {
1018:     xptr = thrust::device_pointer_cast(xarray);
1019:     thrust::transform(VecCUDAThrustPolicy(xin),xptr,xptr+n,xptr,conjugate());
1020:   } catch (char *ex) {
1021:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1022:   }
1023:   PetscLogGpuTimeEnd();
1024:   VecCUDARestoreArray(xin,&xarray);
1025: #else
1026: #endif
1027:   return 0;
1028: }

1030: static inline PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1031: {
1032:   PetscBool      wisseqcuda;

1037:   PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1038:   if (w->data && wisseqcuda) {
1039:     if (((Vec_Seq*)w->data)->array_allocated) {
1040:       if (w->pinned_memory) {
1041:         PetscMallocSetCUDAHost();
1042:       }
1043:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1044:       if (w->pinned_memory) {
1045:         PetscMallocResetCUDAHost();
1046:         w->pinned_memory = PETSC_FALSE;
1047:       }
1048:     }
1049:     ((Vec_Seq*)w->data)->array = NULL;
1050:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1051:   }
1052:   if (w->spptr && wisseqcuda) {
1053:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1054:       cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);
1055:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1056:     }
1057:     if (((Vec_CUDA*)w->spptr)->stream) {
1058:       cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);
1059:     }
1060:     PetscFree(w->spptr);
1061:   }

1063:   if (v->petscnative && wisseqcuda) {
1064:     PetscFree(w->data);
1065:     w->data = v->data;
1066:     w->offloadmask = v->offloadmask;
1067:     w->pinned_memory = v->pinned_memory;
1068:     w->spptr = v->spptr;
1069:     PetscObjectStateIncrease((PetscObject)w);
1070:   } else {
1071:     if (read) {
1072:       VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1073:     } else {
1074:       VecGetArray(v,&((Vec_Seq*)w->data)->array);
1075:     }
1076:     w->offloadmask = PETSC_OFFLOAD_CPU;
1077:     if (wisseqcuda) {
1078:       VecCUDAAllocateCheck(w);
1079:     }
1080:   }
1081:   return 0;
1082: }

1084: static inline PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1085: {
1086:   PetscBool      wisseqcuda;

1091:   PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1092:   if (v->petscnative && wisseqcuda) {
1093:     v->data = w->data;
1094:     v->offloadmask = w->offloadmask;
1095:     v->pinned_memory = w->pinned_memory;
1096:     v->spptr = w->spptr;
1097:     w->data = 0;
1098:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1099:     w->spptr = 0;
1100:   } else {
1101:     if (read) {
1102:       VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1103:     } else {
1104:       VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1105:     }
1106:     if ((Vec_CUDA*)w->spptr && wisseqcuda) {
1107:       cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);
1108:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1109:       if (((Vec_CUDA*)v->spptr)->stream) {
1110:         cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);
1111:       }
1112:       PetscFree(w->spptr);
1113:     }
1114:   }
1115:   return 0;
1116: }

1118: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1119: {
1120:   VecGetLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1121:   return 0;
1122: }

1124: PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v,Vec w)
1125: {
1126:   VecGetLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1127:   return 0;
1128: }

1130: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1131: {
1132:   VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1133:   return 0;
1134: }

1136: PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v,Vec w)
1137: {
1138:   VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1139:   return 0;
1140: }

1142: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1143: {
1144:   __host__ __device__
1145:   PetscReal operator()(const PetscScalar& x) {
1146:     return PetscRealPart(x);
1147:   }
1148: };

1150: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1151: {
1152:   __host__ __device__
1153:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
1154:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1155:   }
1156: };

1158: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1159: {
1160:   __host__ __device__
1161:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1162:     return x < y ? y : x;
1163:   }
1164: };

1166: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1167: {
1168:   __host__ __device__
1169:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1170:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1171:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1172:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1173:   }
1174: };

1176: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1177: {
1178:   __host__ __device__
1179:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1180:     return x < y ? x : y;
1181:   }
1182: };

1184: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1185: {
1186:   __host__ __device__
1187:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1188:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1189:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1190:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1191:   }
1192: };

1194: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1195: {
1196:   PetscInt                              n = v->map->n;
1197:   const PetscScalar                     *av;
1198:   thrust::device_ptr<const PetscScalar> avpt;

1201:   if (!n) {
1202:     *m = PETSC_MIN_REAL;
1203:     if (p) *p = -1;
1204:     return 0;
1205:   }
1206:   VecCUDAGetArrayRead(v,&av);
1207:   avpt = thrust::device_pointer_cast(av);
1208:   PetscLogGpuTimeBegin();
1209:   if (p) {
1210:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1211:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1212:     try {
1213: #if defined(PETSC_USE_COMPLEX)
1214:       res = thrust::transform_reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,petscrealparti(),res,petscmaxi());
1215: #else
1216:       res = thrust::reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,res,petscmaxi());
1217: #endif
1218:     } catch (char *ex) {
1219:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1220:     }
1221:     *m = res.get<0>();
1222:     *p = res.get<1>();
1223:   } else {
1224:     try {
1225: #if defined(PETSC_USE_COMPLEX)
1226:       *m = thrust::transform_reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1227: #else
1228:       *m = thrust::reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1229: #endif
1230:     } catch (char *ex) {
1231:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1232:     }
1233:   }
1234:   PetscLogGpuTimeEnd();
1235:   VecCUDARestoreArrayRead(v,&av);
1236:   return 0;
1237: }

1239: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1240: {
1241:   PetscInt                              n = v->map->n;
1242:   const PetscScalar                     *av;
1243:   thrust::device_ptr<const PetscScalar> avpt;

1246:   if (!n) {
1247:     *m = PETSC_MAX_REAL;
1248:     if (p) *p = -1;
1249:     return 0;
1250:   }
1251:   VecCUDAGetArrayRead(v,&av);
1252:   avpt = thrust::device_pointer_cast(av);
1253:   PetscLogGpuTimeBegin();
1254:   if (p) {
1255:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1256:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1257:     try {
1258: #if defined(PETSC_USE_COMPLEX)
1259:       res = thrust::transform_reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,petscrealparti(),res,petscmini());
1260: #else
1261:       res = thrust::reduce(VecCUDAThrustPolicy(v),zibit,zibit+n,res,petscmini());
1262: #endif
1263:     } catch (char *ex) {
1264:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1265:     }
1266:     *m = res.get<0>();
1267:     *p = res.get<1>();
1268:   } else {
1269:     try {
1270: #if defined(PETSC_USE_COMPLEX)
1271:       *m = thrust::transform_reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1272: #else
1273:       *m = thrust::reduce(VecCUDAThrustPolicy(v),avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1274: #endif
1275:     } catch (char *ex) {
1276:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1277:     }
1278:   }
1279:   PetscLogGpuTimeEnd();
1280:   VecCUDARestoreArrayRead(v,&av);
1281:   return 0;
1282: }

1284: PetscErrorCode VecSum_SeqCUDA(Vec v,PetscScalar *sum)
1285: {
1286:   PetscInt                              n = v->map->n;
1287:   const PetscScalar                     *a;
1288:   thrust::device_ptr<const PetscScalar> dptr;

1291:   VecCUDAGetArrayRead(v,&a);
1292:   dptr = thrust::device_pointer_cast(a);
1293:   PetscLogGpuTimeBegin();
1294:   try {
1295:     *sum = thrust::reduce(VecCUDAThrustPolicy(v),dptr,dptr+n,PetscScalar(0.0));
1296:   } catch (char *ex) {
1297:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1298:   }
1299:   PetscLogGpuTimeEnd();
1300:   VecCUDARestoreArrayRead(v,&a);
1301:   return 0;
1302: }

1304: struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
1305: {
1306:   const PetscScalar shift_;
1307:   petscshift(PetscScalar shift) : shift_(shift){}
1308:   __host__ __device__
1309:   PetscScalar operator()(PetscScalar x) {return x + shift_;}
1310: };

1312: PetscErrorCode VecShift_SeqCUDA(Vec v,PetscScalar shift)
1313: {
1314:   PetscInt                              n = v->map->n;
1315:   PetscScalar                           *a;
1316:   thrust::device_ptr<PetscScalar>       dptr;

1319:   VecCUDAGetArray(v,&a);
1320:   dptr = thrust::device_pointer_cast(a);
1321:   PetscLogGpuTimeBegin();
1322:   try {
1323:     thrust::transform(VecCUDAThrustPolicy(v),dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
1324:   } catch (char *ex) {
1325:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1326:   }
1327:   PetscLogGpuTimeEnd();
1328:   VecCUDARestoreArray(v,&a);
1329:   return 0;
1330: }

1332: #if defined(PETSC_HAVE_NVSHMEM)
1333: /* Free old CUDA array and re-allocate a new one from nvshmem symmetric heap.
1334:    New array does not retain values in the old array. The offload mask is not changed.

1336:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1337:  */
1338: PetscErrorCode  VecAllocateNVSHMEM_SeqCUDA(Vec v)
1339: {
1340:   cudaError_t    cerr;
1341:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;
1342:   PetscInt       n;

1344:   cudaFree(veccuda->GPUarray_allocated);
1345:   VecGetLocalSize(v,&n);
1346:   MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1347:   PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1348:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1349:   veccuda->nvshmem  = PETSC_TRUE;
1350:   return 0;
1351: }
1352: #endif