Actual source code: veccuda2.cu

petsc-3.9.4 2018-09-11
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <petsc/private/vecimpl.h>
  9:  #include <../src/vec/vec/impls/dvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUDAFlag for the vector
 20:     Does NOT zero the CUDA array

 22:  */
 23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 24: {
 26:   cudaError_t    err;
 27:   cudaStream_t   stream;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 37:     veccuda->stream = stream;
 38:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->valid_GPU_array = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:   }
 47:   return(0);
 48: }

 50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 51: PetscErrorCode VecCUDACopyToGPU(Vec v)
 52: {
 54:   cudaError_t    err;
 55:   Vec_CUDA       *veccuda;
 56:   PetscScalar    *varray;

 60:   VecCUDAAllocateCheck(v);
 61:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 62:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 63:     veccuda=(Vec_CUDA*)v->spptr;
 64:     varray=veccuda->GPUarray;
 65:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 66:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 67:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 68:   }
 69:   return(0);
 70: }

 72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
 73: {
 74:   PetscScalar    *varray;
 76:   cudaError_t    err;
 77:   PetscScalar    *cpuPtr, *gpuPtr;
 78:   Vec_Seq        *s;
 79:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

 83:   VecCUDAAllocateCheck(v);
 84:   if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
 85:     s = (Vec_Seq*)v->data;

 87:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 88:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 89:     gpuPtr = varray + ptop_scatter->recvLowestIndex;
 90:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

 92:     /* Note : this code copies the smallest contiguous chunk of data
 93:        containing ALL of the indices */
 94:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

 96:     // Set the buffer states
 97:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
 98:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
 99:   }
100:   return(0);
101: }


104: /*
105:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
106: */
107: PetscErrorCode VecCUDACopyFromGPU(Vec v)
108: {
110:   cudaError_t    err;
111:   Vec_CUDA       *veccuda;
112:   PetscScalar    *varray;

116:   VecCUDAAllocateCheckHost(v);
117:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
118:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
119:     veccuda=(Vec_CUDA*)v->spptr;
120:     varray=veccuda->GPUarray;
121:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
122:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
123:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
124:   }
125:   return(0);
126: }

128: /* Note that this function only copies *some* of the values up from the GPU to CPU,
 129:    which means that we need to recombine the data at some point before using any of the standard functions.
 130:    We could add another few flag types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
 131:    where you always have to call them in pairs.
132: */
133: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
134: {
135:   const PetscScalar *varray, *gpuPtr;
136:   PetscErrorCode    ierr;
137:   cudaError_t       err;
138:   PetscScalar       *cpuPtr;
139:   Vec_Seq           *s;
140:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

144:   VecCUDAAllocateCheckHost(v);
145:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
146:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);

148:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
149:     s = (Vec_Seq*)v->data;
150:     gpuPtr = varray + ptop_scatter->sendLowestIndex;
151:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

153:     /* Note : this code copies the smallest contiguous chunk of data
154:        containing ALL of the indices */
155:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);

157:     VecCUDARestoreArrayRead(v,&varray);
158:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
159:   }
160:   return(0);
161: }

163: /*MC
164:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

166:    Options Database Keys:
167: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

169:   Level: beginner

171: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
172: M*/
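
/*
   Editor's note: a minimal usage sketch (not part of veccuda2.cu) showing one way a
   VECSEQCUDA vector is typically created; the helper name ExampleCreateVecSeqCUDA and the
   chosen vector length are illustrative only.  Errors are checked with the usual PETSc
   CHKERRQ() convention.
*/
static PetscErrorCode ExampleCreateVecSeqCUDA(PetscInt n,Vec *x)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(PETSC_COMM_SELF,x);CHKERRQ(ierr);
  ierr = VecSetSizes(*x,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*x,VECSEQCUDA);CHKERRQ(ierr);  /* or VecSetFromOptions(*x) together with -vec_type seqcuda */
  ierr = VecSet(*x,1.0);CHKERRQ(ierr);             /* subsequent operations execute on the GPU */
  PetscFunctionReturn(0);
}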

174: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
175: {
176:   const PetscScalar *xarray;
177:   PetscScalar       *yarray;
178:   PetscErrorCode    ierr;
179:   PetscBLASInt      one=1,bn;
180:   PetscScalar       sone=1.0;
181:   cublasHandle_t    cublasv2handle;
182:   cublasStatus_t    cberr;
183:   cudaError_t       err;

186:   PetscCUBLASGetHandle(&cublasv2handle);
187:   PetscBLASIntCast(yin->map->n,&bn);
188:   VecCUDAGetArrayRead(xin,&xarray);
189:   VecCUDAGetArrayReadWrite(yin,&yarray);
190:   if (alpha == (PetscScalar)0.0) {
191:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
192:   } else if (alpha == (PetscScalar)1.0) {
193:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
194:     PetscLogFlops(2.0*yin->map->n);
195:   } else {
196:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
197:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
198:     PetscLogFlops(2.0*yin->map->n);
199:   }
200:   WaitForGPU();CHKERRCUDA(ierr);
201:   VecCUDARestoreArrayRead(xin,&xarray);
202:   VecCUDARestoreArrayReadWrite(yin,&yarray);
203:   return(0);
204: }

206: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
207: {
208:   const PetscScalar *xarray;
209:   PetscScalar       *yarray;
210:   PetscErrorCode    ierr;
211:   PetscBLASInt      one=1,bn;
212:   cublasHandle_t    cublasv2handle;
213:   cublasStatus_t    cberr;

216:   PetscCUBLASGetHandle(&cublasv2handle);
217:   if (alpha != (PetscScalar)0.0) {
218:     PetscBLASIntCast(yin->map->n,&bn);
219:     VecCUDAGetArrayRead(xin,&xarray);
220:     VecCUDAGetArrayReadWrite(yin,&yarray);
221:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
222:     WaitForGPU();CHKERRCUDA(ierr);
223:     VecCUDARestoreArrayRead(xin,&xarray);
224:     VecCUDARestoreArrayReadWrite(yin,&yarray);
225:     PetscLogFlops(2.0*yin->map->n);
226:   }
227:   return(0);
228: }

230: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
231: {
232:   PetscInt                              n = xin->map->n;
233:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
234:   PetscScalar                           *warray=NULL;
235:   thrust::device_ptr<const PetscScalar> xptr,yptr;
236:   thrust::device_ptr<PetscScalar>       wptr;
237:   PetscErrorCode                        ierr;

240:   VecCUDAGetArrayWrite(win,&warray);
241:   VecCUDAGetArrayRead(xin,&xarray);
242:   VecCUDAGetArrayRead(yin,&yarray);
243:   try {
244:     wptr = thrust::device_pointer_cast(warray);
245:     xptr = thrust::device_pointer_cast(xarray);
246:     yptr = thrust::device_pointer_cast(yarray);
247:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
248:     WaitForGPU();CHKERRCUDA(ierr);
249:   } catch (char *ex) {
250:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
251:   }
252:   PetscLogFlops(n);
253:   VecCUDARestoreArrayRead(xin,&xarray);
254:   VecCUDARestoreArrayRead(yin,&yarray);
255:   VecCUDARestoreArrayWrite(win,&warray);
256:   return(0);
257: }

259: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
260: {
261:   const PetscScalar *xarray=NULL,*yarray=NULL;
262:   PetscScalar       *warray=NULL;
263:   PetscErrorCode    ierr;
264:   PetscBLASInt      one=1,bn;
265:   cublasHandle_t    cublasv2handle;
266:   cublasStatus_t    cberr;
267:   cudaError_t       err;

270:   PetscCUBLASGetHandle(&cublasv2handle);
271:   PetscBLASIntCast(win->map->n,&bn);
272:   if (alpha == (PetscScalar)0.0) {
273:     VecCopy_SeqCUDA(yin,win);
274:   } else {
275:     VecCUDAGetArrayRead(xin,&xarray);
276:     VecCUDAGetArrayRead(yin,&yarray);
277:     VecCUDAGetArrayWrite(win,&warray);
278:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
279:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
280:     PetscLogFlops(2*win->map->n);
281:     WaitForGPU();CHKERRCUDA(ierr);
282:     VecCUDARestoreArrayRead(xin,&xarray);
283:     VecCUDARestoreArrayRead(yin,&yarray);
284:     VecCUDARestoreArrayWrite(win,&warray);
285:   }
286:   return(0);
287: }

289: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
290: {
292:   PetscInt       n = xin->map->n,j,j_rem;
293:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

296:   PetscLogFlops(nv*2.0*n);
297:   switch (j_rem=nv&0x3) {
298:     case 3:
299:       alpha0 = alpha[0];
300:       alpha1 = alpha[1];
301:       alpha2 = alpha[2];
302:       alpha += 3;
303:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
304:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
305:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
306:       y   += 3;
307:       break;
308:     case 2:
309:       alpha0 = alpha[0];
310:       alpha1 = alpha[1];
311:       alpha +=2;
312:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
313:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
314:       y +=2;
315:       break;
316:     case 1:
317:       alpha0 = *alpha++;
318:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
319:       y     +=1;
320:       break;
321:   }
322:   for (j=j_rem; j<nv; j+=4) {
323:     alpha0 = alpha[0];
324:     alpha1 = alpha[1];
325:     alpha2 = alpha[2];
326:     alpha3 = alpha[3];
327:     alpha += 4;
328:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
329:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
330:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
331:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
332:     y   += 4;
333:   }
334:   WaitForGPU();CHKERRCUDA(ierr);
335:   return(0);
336: }

338: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
339: {
340:   const PetscScalar *xarray,*yarray;
341:   PetscErrorCode    ierr;
342:   PetscBLASInt      one=1,bn;
343:   cublasHandle_t    cublasv2handle;
344:   cublasStatus_t    cberr;

347:   PetscCUBLASGetHandle(&cublasv2handle);
348:   PetscBLASIntCast(yin->map->n,&bn);
349:   VecCUDAGetArrayRead(xin,&xarray);
350:   VecCUDAGetArrayRead(yin,&yarray);
351:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
352:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
353:   WaitForGPU();CHKERRCUDA(ierr);
354:   if (xin->map->n >0) {
355:     PetscLogFlops(2.0*xin->map->n-1);
356:   }
357:   VecCUDARestoreArrayRead(xin,&xarray);
358:   VecCUDARestoreArrayRead(yin,&yarray);
359:   return(0);
360: }

362: //
363: // CUDA kernels for MDot to follow
364: //

366: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
367: #define MDOT_WORKGROUP_SIZE 128
368: #define MDOT_WORKGROUP_NUM  128

370: #if !defined(PETSC_USE_COMPLEX)
371: // M = 2:
372: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
373:                                         PetscInt size, PetscScalar *group_results)
374: {
375:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
376:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
377:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
378:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
379:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

381:   PetscScalar entry_x    = 0;
382:   PetscScalar group_sum0 = 0;
383:   PetscScalar group_sum1 = 0;
384:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
385:     entry_x     = x[i];   // load only once from global memory!
386:     group_sum0 += entry_x * y0[i];
387:     group_sum1 += entry_x * y1[i];
388:   }
389:   tmp_buffer[threadIdx.x]                       = group_sum0;
390:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

392:   // parallel reduction
393:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
394:     __syncthreads();
395:     if (threadIdx.x < stride) {
396:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
397:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
398:     }
399:   }

401:   // write result of group to group_results
402:   if (threadIdx.x == 0) {
403:     group_results[blockIdx.x]             = tmp_buffer[0];
404:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
405:   }
406: }

408: // M = 3:
409: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
410:                                         PetscInt size, PetscScalar *group_results)
411: {
412:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
413:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
414:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
415:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
416:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

418:   PetscScalar entry_x    = 0;
419:   PetscScalar group_sum0 = 0;
420:   PetscScalar group_sum1 = 0;
421:   PetscScalar group_sum2 = 0;
422:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
423:     entry_x     = x[i];   // load only once from global memory!
424:     group_sum0 += entry_x * y0[i];
425:     group_sum1 += entry_x * y1[i];
426:     group_sum2 += entry_x * y2[i];
427:   }
428:   tmp_buffer[threadIdx.x]                           = group_sum0;
429:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
430:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

432:   // parallel reduction
433:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
434:     __syncthreads();
435:     if (threadIdx.x < stride) {
436:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
437:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
438:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
439:     }
440:   }

442:   // write result of group to group_results
443:   if (threadIdx.x == 0) {
444:     group_results[blockIdx.x                ] = tmp_buffer[0];
445:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
446:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
447:   }
448: }

450: // M = 4:
451: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
452:                                         PetscInt size, PetscScalar *group_results)
453: {
454:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
455:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
456:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
457:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
458:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

460:   PetscScalar entry_x    = 0;
461:   PetscScalar group_sum0 = 0;
462:   PetscScalar group_sum1 = 0;
463:   PetscScalar group_sum2 = 0;
464:   PetscScalar group_sum3 = 0;
465:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
466:     entry_x     = x[i];   // load only once from global memory!
467:     group_sum0 += entry_x * y0[i];
468:     group_sum1 += entry_x * y1[i];
469:     group_sum2 += entry_x * y2[i];
470:     group_sum3 += entry_x * y3[i];
471:   }
472:   tmp_buffer[threadIdx.x]                           = group_sum0;
473:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
474:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
475:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

477:   // parallel reduction
478:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479:     __syncthreads();
480:     if (threadIdx.x < stride) {
481:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
482:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
483:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
485:     }
486:   }

488:   // write result of group to group_results
489:   if (threadIdx.x == 0) {
490:     group_results[blockIdx.x                ] = tmp_buffer[0];
491:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
492:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
493:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
494:   }
495: }

497: // M = 8:
498: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
499:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
500:                                           PetscInt size, PetscScalar *group_results)
501: {
502:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
503:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
504:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
505:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
506:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

508:   PetscScalar entry_x    = 0;
509:   PetscScalar group_sum0 = 0;
510:   PetscScalar group_sum1 = 0;
511:   PetscScalar group_sum2 = 0;
512:   PetscScalar group_sum3 = 0;
513:   PetscScalar group_sum4 = 0;
514:   PetscScalar group_sum5 = 0;
515:   PetscScalar group_sum6 = 0;
516:   PetscScalar group_sum7 = 0;
517:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
518:     entry_x     = x[i];   // load only once from global memory!
519:     group_sum0 += entry_x * y0[i];
520:     group_sum1 += entry_x * y1[i];
521:     group_sum2 += entry_x * y2[i];
522:     group_sum3 += entry_x * y3[i];
523:     group_sum4 += entry_x * y4[i];
524:     group_sum5 += entry_x * y5[i];
525:     group_sum6 += entry_x * y6[i];
526:     group_sum7 += entry_x * y7[i];
527:   }
528:   tmp_buffer[threadIdx.x]                           = group_sum0;
529:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
530:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
531:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
532:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
533:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
534:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
535:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

537:   // parallel reduction
538:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
539:     __syncthreads();
540:     if (threadIdx.x < stride) {
541:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
542:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
543:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
544:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
545:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
546:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
547:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
548:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
549:     }
550:   }

552:   // write result of group to group_results
553:   if (threadIdx.x == 0) {
554:     group_results[blockIdx.x                ] = tmp_buffer[0];
555:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
556:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
557:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
558:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
559:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
560:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
561:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
562:   }
563: }
564: #endif /* !defined(PETSC_USE_COMPLEX) */

566: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
567: {
568:   PetscErrorCode    ierr;
569:   PetscInt          i,n = xin->map->n,current_y_index = 0;
570:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
571:   PetscScalar       *group_results_gpu;
572: #if !defined(PETSC_USE_COMPLEX)
573:   PetscInt          j;
574:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
575: #endif
576:   cudaError_t    cuda_ierr;
577:   PetscBLASInt   one=1,bn;
578:   cublasHandle_t cublasv2handle;
579:   cublasStatus_t cberr;

582:   PetscCUBLASGetHandle(&cublasv2handle);
583:   PetscBLASIntCast(xin->map->n,&bn);
584:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
585:   /* Handle the case of local size zero first */
586:   if (!xin->map->n) {
587:     for (i=0; i<nv; ++i) z[i] = 0;
588:     return(0);
589:   }

591:   // allocate scratchpad memory for the results of individual work groups:
592:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

594:   VecCUDAGetArrayRead(xin,&xptr);

596:   while (current_y_index < nv)
597:   {
598:     switch (nv - current_y_index) {

600:       case 7:
601:       case 6:
602:       case 5:
603:       case 4:
604:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
605:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
606:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
607:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);

609: #if defined(PETSC_USE_COMPLEX)
610:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
611:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
612:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
613:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
614: #else
615:         // run kernel:
616:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

618:         // copy results back to the CPU:
619:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

621:         // sum group results into z:
622:         for (j=0; j<4; ++j) {
623:           z[current_y_index + j] = 0;
624:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
625:         }
626: #endif
627:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
628:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
629:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
630:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
631:         current_y_index += 4;
632:         break;

634:       case 3:
635:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
636:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
637:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

639: #if defined(PETSC_USE_COMPLEX)
640:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
641:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
642:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
643: #else
644:         // run kernel:
645:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

647:         // copy results back to the CPU:
648:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

650:         // sum group results into z:
651:         for (j=0; j<3; ++j) {
652:           z[current_y_index + j] = 0;
653:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
654:         }
655: #endif

657:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
658:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
659:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
660:         current_y_index += 3;
661:         break;

663:       case 2:
664:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
665:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);

667: #if defined(PETSC_USE_COMPLEX)
668:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
669:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
670: #else
671:         // run kernel:
672:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

674:         // copy results back to the CPU:
675:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

677:         // sum group results into z:
678:         for (j=0; j<2; ++j) {
679:           z[current_y_index + j] = 0;
680:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
681:         }
682: #endif
683:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
684:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
685:         current_y_index += 2;
686:         break;

688:       case 1:
689:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
690:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
691:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
692:         current_y_index += 1;
693:         break;

695:       default: // 8 or more vectors left
696:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
697:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
698:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
699:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
700:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
701:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
702:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
703:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);

705: #if defined(PETSC_USE_COMPLEX)
706:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
707:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
708:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
709:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
710:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
711:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
712:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
713:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
714: #else
715:         // run kernel:
716:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

718:         // copy results back to the CPU:
719:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

721:         // sum group results into z:
722:         for (j=0; j<8; ++j) {
723:           z[current_y_index + j] = 0;
724:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
725:         }
726: #endif
727:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
728:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
729:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
730:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
731:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
732:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
733:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
734:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
735:         current_y_index += 8;
736:         break;
737:     }
738:   }
739:   VecCUDARestoreArrayRead(xin,&xptr);

741:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
742:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
743:   return(0);
744: }

746: #undef MDOT_WORKGROUP_SIZE
747: #undef MDOT_WORKGROUP_NUM

749: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
750: {
751:   PetscInt                        n = xin->map->n;
752:   PetscScalar                     *xarray=NULL;
753:   thrust::device_ptr<PetscScalar> xptr;
754:   PetscErrorCode                  ierr;
755:   cudaError_t                     err;

758:   VecCUDAGetArrayWrite(xin,&xarray);
759:   if (alpha == (PetscScalar)0.0) {
760:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
761:   } else {
762:     try {
763:       xptr = thrust::device_pointer_cast(xarray);
764:       thrust::fill(xptr,xptr+n,alpha);
765:     } catch (char *ex) {
766:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
767:     }
768:   }
769:   WaitForGPU();CHKERRCUDA(ierr);
770:   VecCUDARestoreArrayWrite(xin,&xarray);
771:   return(0);
772: }

774: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
775: {
776:   PetscScalar    *xarray;
778:   PetscBLASInt   one=1,bn;
779:   cublasHandle_t cublasv2handle;
780:   cublasStatus_t cberr;

783:   if (alpha == (PetscScalar)0.0) {
784:     VecSet_SeqCUDA(xin,alpha);
785:   } else if (alpha != (PetscScalar)1.0) {
786:     PetscCUBLASGetHandle(&cublasv2handle);
787:     PetscBLASIntCast(xin->map->n,&bn);
788:     VecCUDAGetArrayReadWrite(xin,&xarray);
789:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
790:     VecCUDARestoreArrayReadWrite(xin,&xarray);
791:   }
792:   WaitForGPU();CHKERRCUDA(ierr);
793:   PetscLogFlops(xin->map->n);
794:   return(0);
795: }

797: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
798: {
799:   const PetscScalar *xarray,*yarray;
800:   PetscErrorCode    ierr;
801:   PetscBLASInt      one=1,bn;
802:   cublasHandle_t    cublasv2handle;
803:   cublasStatus_t    cberr;

806:   PetscCUBLASGetHandle(&cublasv2handle);
807:   PetscBLASIntCast(xin->map->n,&bn);
808:   VecCUDAGetArrayRead(xin,&xarray);
809:   VecCUDAGetArrayRead(yin,&yarray);
810:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
811:   WaitForGPU();CHKERRCUDA(ierr);
812:   if (xin->map->n > 0) {
813:     PetscLogFlops(2.0*xin->map->n-1);
814:   }
815:   VecCUDARestoreArrayRead(yin,&yarray);
816:   VecCUDARestoreArrayRead(xin,&xarray);
817:   return(0);
818: }

820: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
821: {
822:   const PetscScalar *xarray;
823:   PetscScalar       *yarray;
824:   PetscErrorCode    ierr;
825:   cudaError_t       err;

828:   if (xin != yin) {
829:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
830:       VecCUDAGetArrayRead(xin,&xarray);
831:       VecCUDAGetArrayWrite(yin,&yarray);
832:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
833:       WaitForGPU();CHKERRCUDA(ierr);
834:       VecCUDARestoreArrayRead(xin,&xarray);
835:       VecCUDARestoreArrayWrite(yin,&yarray);

837:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
838:       /* copy on the CPU if we are on the CPU */
839:       VecCopy_SeqCUDA_Private(xin,yin);
840:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
841:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
842:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
843:         /* copy in CPU */
844:         VecCopy_SeqCUDA_Private(xin,yin);

846:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
847:         /* copy in GPU */
848:         VecCUDAGetArrayRead(xin,&xarray);
849:         VecCUDAGetArrayWrite(yin,&yarray);
850:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
851:         VecCUDARestoreArrayRead(xin,&xarray);
852:         VecCUDARestoreArrayWrite(yin,&yarray);
853:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
854:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
855:            default to copying on the GPU (this is an arbitrary choice) */
856:         VecCUDAGetArrayRead(xin,&xarray);
857:         VecCUDAGetArrayWrite(yin,&yarray);
858:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
859:         VecCUDARestoreArrayRead(xin,&xarray);
860:         VecCUDARestoreArrayWrite(yin,&yarray);
861:       } else {
862:         VecCopy_SeqCUDA_Private(xin,yin);
863:       }
864:     }
865:   }
866:   return(0);
867: }

869: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
870: {
872:   PetscBLASInt   one = 1,bn;
873:   PetscScalar    *xarray,*yarray;
874:   cublasHandle_t cublasv2handle;
875:   cublasStatus_t cberr;

878:   PetscCUBLASGetHandle(&cublasv2handle);
879:   PetscBLASIntCast(xin->map->n,&bn);
880:   if (xin != yin) {
881:     VecCUDAGetArrayReadWrite(xin,&xarray);
882:     VecCUDAGetArrayReadWrite(yin,&yarray);
883:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
884:     WaitForGPU();CHKERRCUDA(ierr);
885:     VecCUDARestoreArrayReadWrite(xin,&xarray);
886:     VecCUDARestoreArrayReadWrite(yin,&yarray);
887:   }
888:   return(0);
889: }

891: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
892: {
893:   PetscErrorCode    ierr;
894:   PetscScalar       a = alpha,b = beta;
895:   const PetscScalar *xarray;
896:   PetscScalar       *yarray;
897:   PetscBLASInt      one = 1, bn;
898:   cublasHandle_t    cublasv2handle;
899:   cublasStatus_t    cberr;
900:   cudaError_t       err;

903:   PetscCUBLASGetHandle(&cublasv2handle);
904:   PetscBLASIntCast(yin->map->n,&bn);
905:   if (a == (PetscScalar)0.0) {
906:     VecScale_SeqCUDA(yin,beta);
907:   } else if (b == (PetscScalar)1.0) {
908:     VecAXPY_SeqCUDA(yin,alpha,xin);
909:   } else if (a == (PetscScalar)1.0) {
910:     VecAYPX_SeqCUDA(yin,beta,xin);
911:   } else if (b == (PetscScalar)0.0) {
912:     VecCUDAGetArrayRead(xin,&xarray);
913:     VecCUDAGetArrayReadWrite(yin,&yarray);
914:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
915:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
916:     PetscLogFlops(xin->map->n);
917:     WaitForGPU();CHKERRCUDA(ierr);
918:     VecCUDARestoreArrayRead(xin,&xarray);
919:     VecCUDARestoreArrayReadWrite(yin,&yarray);
920:   } else {
921:     VecCUDAGetArrayRead(xin,&xarray);
922:     VecCUDAGetArrayReadWrite(yin,&yarray);
923:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
924:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
925:     VecCUDARestoreArrayRead(xin,&xarray);
926:     VecCUDARestoreArrayReadWrite(yin,&yarray);
927:     WaitForGPU();CHKERRCUDA(ierr);
928:     PetscLogFlops(3.0*xin->map->n);
929:   }
930:   return(0);
931: }

933: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
934: {
936:   PetscInt       n = zin->map->n;

939:   if (gamma == (PetscScalar)1.0) {
940:     /* z = a*x + b*y + z */
941:     VecAXPY_SeqCUDA(zin,alpha,xin);
942:     VecAXPY_SeqCUDA(zin,beta,yin);
943:     PetscLogFlops(4.0*n);
944:   } else {
945:     /* z = a*x + b*y + c*z */
946:     VecScale_SeqCUDA(zin,gamma);
947:     VecAXPY_SeqCUDA(zin,alpha,xin);
948:     VecAXPY_SeqCUDA(zin,beta,yin);
949:     PetscLogFlops(5.0*n);
950:   }
951:   WaitForGPU();CHKERRCUDA(ierr);
952:   return(0);
953: }

955: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
956: {
957:   PetscInt                              n = win->map->n;
958:   const PetscScalar                     *xarray,*yarray;
959:   PetscScalar                           *warray;
960:   thrust::device_ptr<const PetscScalar> xptr,yptr;
961:   thrust::device_ptr<PetscScalar>       wptr;
962:   PetscErrorCode                        ierr;

965:   VecCUDAGetArrayReadWrite(win,&warray);
966:   VecCUDAGetArrayRead(xin,&xarray);
967:   VecCUDAGetArrayRead(yin,&yarray);
968:   try {
969:     wptr = thrust::device_pointer_cast(warray);
970:     xptr = thrust::device_pointer_cast(xarray);
971:     yptr = thrust::device_pointer_cast(yarray);
972:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
973:     WaitForGPU();CHKERRCUDA(ierr);
974:   } catch (char *ex) {
975:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
976:   }
977:   VecCUDARestoreArrayRead(xin,&xarray);
978:   VecCUDARestoreArrayRead(yin,&yarray);
979:   VecCUDARestoreArrayReadWrite(win,&warray);
980:   PetscLogFlops(n);
981:   return(0);
982: }

984: /* should do infinity norm in CUDA */

986: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
987: {
988:   PetscErrorCode    ierr;
989:   PetscInt          n = xin->map->n;
990:   PetscBLASInt      one = 1, bn;
991:   const PetscScalar *xarray;
992:   cublasHandle_t    cublasv2handle;
993:   cublasStatus_t    cberr;
994:   cudaError_t       err;

997:   PetscCUBLASGetHandle(&cublasv2handle);
998:   PetscBLASIntCast(n,&bn);
999:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1000:     VecCUDAGetArrayRead(xin,&xarray);
1001:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1002:     WaitForGPU();CHKERRCUDA(ierr);
1003:     VecCUDARestoreArrayRead(xin,&xarray);
1004:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1005:   } else if (type == NORM_INFINITY) {
1006:     int  i;
1007:     VecCUDAGetArrayRead(xin,&xarray);
1008:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1009:     err = cudaMemcpy(z,xarray+i,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1010:     VecCUDARestoreArrayRead(xin,&xarray);
1011:   } else if (type == NORM_1) {
1012:     VecCUDAGetArrayRead(xin,&xarray);
1013:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1014:     VecCUDARestoreArrayRead(xin,&xarray);
1015:     WaitForGPU();CHKERRCUDA(ierr);
1016:     PetscLogFlops(PetscMax(n-1.0,0.0));
1017:   } else if (type == NORM_1_AND_2) {
1018:     VecNorm_SeqCUDA(xin,NORM_1,z);
1019:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1020:   }
1021:   return(0);
1022: }

1024: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1025: {
1026:   PetscErrorCode    ierr;
1027:   PetscReal         n=s->map->n;
1028:   const PetscScalar *sarray,*tarray;

1031:   VecCUDAGetArrayRead(s,&sarray);
1032:   VecCUDAGetArrayRead(t,&tarray);
1033:   VecDot_SeqCUDA(s,t,dp);
1034:   VecDot_SeqCUDA(t,t,nm);
1035:   VecCUDARestoreArrayRead(s,&sarray);
1036:   VecCUDARestoreArrayRead(t,&tarray);
1037:   WaitForGPU();CHKERRCUDA(ierr);
1038:   PetscLogFlops(4.0*n);
1039:   return(0);
1040: }

1042: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1043: {
1045:   cudaError_t    err;

1048:   if (v->spptr) {
1049:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1050:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1051:       ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1052:     }
1053:     if (((Vec_CUDA*)v->spptr)->stream) {
1054:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1055:     }
1056:     PetscFree(v->spptr);
1057:   }
1058:   VecDestroy_SeqCUDA_Private(v);
1059:   return(0);
1060: }

1062: #if defined(PETSC_USE_COMPLEX)
1063: struct conjugate
1064: {
1065:   __host__ __device__
1066:     PetscScalar operator()(PetscScalar x)
1067:     {
1068:       return PetscConj(x);
1069:     }
1070: };
1071: #endif

1073: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1074: {
1075:   PetscScalar                     *xarray;
1076:   PetscErrorCode                  ierr;
1077: #if defined(PETSC_USE_COMPLEX)
1078:   PetscInt                        n = xin->map->n;
1079:   thrust::device_ptr<PetscScalar> xptr;
1080: #endif

1083:   VecCUDAGetArrayReadWrite(xin,&xarray);
1084: #if defined(PETSC_USE_COMPLEX)
1085:   try {
1086:     xptr = thrust::device_pointer_cast(xarray);
1087:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1088:     WaitForGPU();CHKERRCUDA(ierr);
1089:   } catch (char *ex) {
1090:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1091:   }
1092: #endif
1093:   VecCUDARestoreArrayReadWrite(xin,&xarray);
1094:   return(0);
1095: }

1097: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1098: {
1100:   cudaError_t    err;


1107:   if (w->data) {
1108:     if (((Vec_Seq*)w->data)->array_allocated) {
1109:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1110:     }
1111:     ((Vec_Seq*)w->data)->array = NULL;
1112:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1113:   }
1114:   if (w->spptr) {
1115:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1116:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1117:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1118:     }
1119:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1120:     PetscFree(w->spptr);
1121:   }

1123:   if (v->petscnative) {
1124:     PetscFree(w->data);
1125:     w->data = v->data;
1126:     w->valid_GPU_array = v->valid_GPU_array;
1127:     w->spptr = v->spptr;
1128:     PetscObjectStateIncrease((PetscObject)w);
1129:   } else {
1130:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1131:     w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1132:     VecCUDAAllocateCheck(w);
1133:   }
1134:   return(0);
1135: }

1137: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1138: {
1140:   cudaError_t    err;


1147:   if (v->petscnative) {
1148:     v->data = w->data;
1149:     v->valid_GPU_array = w->valid_GPU_array;
1150:     v->spptr = w->spptr;
1151:     VecCUDACopyFromGPU(v);
1152:     PetscObjectStateIncrease((PetscObject)v);
1153:     w->data = 0;
1154:     w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1155:     w->spptr = 0;
1156:   } else {
1157:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1158:     if ((Vec_CUDA*)w->spptr) {
1159:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1160:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1161:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1162:       PetscFree(w->spptr);
1163:     }
1164:   }
1165:   return(0);
1166: }

1168: /*@C
1169:    VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1171:    This function has semantics similar to VecGetArray():  the pointer
1172:    returned by this function points to a consistent view of the vector
1173:    data.  This may involve a copy operation of data from the host to the
1174:    device if the data on the device is out of date.  If the device
1175:    memory hasn't been allocated previously it will be allocated as part
1176:    of this function call.  VecCUDAGetArrayReadWrite() assumes that
1177:    the user will modify the vector data.  This is similar to
1178:    intent(inout) in Fortran.

1180:    The CUDA device pointer has to be released by calling
1181:    VecCUDARestoreArrayReadWrite().  Upon restoring the vector data
1182:    the data on the host will be marked as out of date.  A subsequent
1183:    access of the host data will thus incur a data transfer from the
1184:    device to the host.


1187:    Input Parameter:
1188: .  v - the vector

1190:    Output Parameter:
1191: .  a - the CUDA device pointer

1193:    Fortran note:
1194:    This function is not currently available from Fortran.

1196:    Level: intermediate

1198: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1199: @*/
1200: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1201: {

1206:   *a   = 0;
1207:   VecCUDACopyToGPU(v);
1208:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1209:   return(0);
1210: }

1212: /*@C
1213:    VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().

1215:    This marks the host data as out of date.  Subsequent access to the
1216:    vector data on the host side with for instance VecGetArray() incurs a
1217:    data transfer.

1219:    Input Parameter:
1220: +  v - the vector
1221: -  a - the CUDA device pointer.  This pointer is invalid after
1222:        VecCUDARestoreArrayReadWrite() returns.

1224:    Fortran note:
1225:    This function is not currently available from Fortran.

1227:    Level: intermediate

1229: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1230: @*/
1231: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1232: {

1237:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1239:   PetscObjectStateIncrease((PetscObject)v);
1240:   return(0);
1241: }
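
/*
   Editor's note: an illustrative sketch (not part of veccuda2.cu) of the
   VecCUDAGetArrayReadWrite()/VecCUDARestoreArrayReadWrite() pairing, with a toy user kernel
   that scales the entries in place.  The kernel and helper names are hypothetical, and the
   sketch is guarded for real scalars only, following the convention used for the MDot
   kernels above.
*/
#if !defined(PETSC_USE_COMPLEX)
static __global__ void ExampleScaleKernel(PetscScalar *x,PetscScalar a,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

static PetscErrorCode ExampleScaleOnGPU(Vec v,PetscScalar a)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_x;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(v,&d_x);CHKERRQ(ierr);     /* device data made current, device pointer returned */
  if (n) {
    ExampleScaleKernel<<<(n+255)/256,256>>>(d_x,a,n);
    err  = cudaGetLastError();CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayReadWrite(v,&d_x);CHKERRQ(ierr); /* host copy is now marked out of date */
  PetscFunctionReturn(0);
}
#endif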

1243: /*@C
1244:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1246:    This function is analogous to VecGetArrayRead():  The pointer
1247:    returned by this function points to a consistent view of the vector
1248:    data.  This may involve a copy operation of data from the host to the
1249:    device if the data on the device is out of date.  If the device
1250:    memory hasn't been allocated previously it will be allocated as part
1251:    of this function call.  VecCUDAGetArrayRead() assumes that the
1252:    user will not modify the vector data.  This is analogous to
1253:    intent(in) in Fortran.

1255:    The CUDA device pointer has to be released by calling
1256:    VecCUDARestoreArrayRead().  If the data on the host side was
1257:    previously up to date it will remain so, i.e. data on both the device
1258:    and the host is up to date.  Accessing data on the host side does not
1259:    incur a device to host data transfer.

1261:    Input Parameter:
1262: .  v - the vector

1264:    Output Parameter:
1265: .  a - the CUDA pointer.

1267:    Fortran note:
1268:    This function is not currently available from Fortran.

1270:    Level: intermediate

1272: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1273: @*/
1274: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1275: {

1280:   *a   = 0;
1281:   VecCUDACopyToGPU(v);
1282:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1283:   return(0);
1284: }

1286: /*@C
1287:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1289:    If the data on the host side was previously up to date it will remain
1290:    so, i.e. data on both the device and the host is up to date.
1291:    Accessing data on the host side e.g. with VecGetArray() does not
1292:    incur a device to host data transfer.

1294:    Input Parameter:
1295: +  v - the vector
1296: -  a - the CUDA device pointer.  This pointer is invalid after
1297:        VecCUDARestoreArrayRead() returns.

1299:    Fortran note:
1300:    This function is not currently available from Fortran.

1302:    Level: intermediate

1304: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1305: @*/
1306: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1307: {
1310:   return(0);
1311: }
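
/*
   Editor's note: an illustrative sketch (not part of veccuda2.cu) of read-only access via
   VecCUDAGetArrayRead()/VecCUDARestoreArrayRead(): the vector's device data is copied into a
   caller-provided device buffer without modifying the vector, so an up-to-date host copy
   stays valid.  The helper name and the d_buf argument are hypothetical.
*/
static PetscErrorCode ExampleCopyToDeviceBuffer(Vec v,PetscScalar *d_buf)
{
  PetscErrorCode    ierr;
  cudaError_t       err;
  const PetscScalar *d_x;
  PetscInt          n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(v,&d_x);CHKERRQ(ierr);
  err  = cudaMemcpy(d_buf,d_x,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v,&d_x);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}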

1313: /*@C
1314:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1316:    The data pointed to by the device pointer is uninitialized.  The user
1317:    may not read from this data.  Furthermore, the entire array needs to
1318:    be filled by the user to obtain well-defined behaviour.  The device
1319:    memory will be allocated by this function if it hasn't been allocated
1320:    previously.  This is analogous to intent(out) in Fortran.

1322:    The device pointer needs to be released with
1323:    VecCUDARestoreArrayWrite().  When the pointer is released the
1324:    host data of the vector is marked as out of date.  Subsequent access
1325:    of the host data with e.g. VecGetArray() incurs a device to host data
1326:    transfer.


1329:    Input Parameter:
1330: .  v - the vector

1332:    Output Parameter:
1333: .  a - the CUDA pointer

1335:    Fortran note:
1336:    This function is not currently available from Fortran.

1338:    Level: advanced

1340: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1341: @*/
1342: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1343: {

1348:   *a   = 0;
1349:   VecCUDAAllocateCheck(v);
1350:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1351:   return(0);
1352: }

1354: /*@C
1355:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1357:    Data on the host will be marked as out of date.  Subsequent access of
1358:    the data on the host side e.g. with VecGetArray() will incur a device
1359:    to host data transfer.

1361:    Input Parameter:
1362: +  v - the vector
1363: -  a - the CUDA device pointer.  This pointer is invalid after
1364:        VecCUDARestoreArrayWrite() returns.

1366:    Fortran note:
1367:    This function is not currently available from Fortran.

1369:    Level: intermediate

1371: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1372: @*/
1373: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1374: {

1379:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

1381:   PetscObjectStateIncrease((PetscObject)v);
1382:   return(0);
1383: }
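
/*
   Editor's note: an illustrative sketch (not part of veccuda2.cu) of write-only access via
   VecCUDAGetArrayWrite()/VecCUDARestoreArrayWrite().  The previous contents are never read
   and the entire array is (re)filled, here simply zeroed with cudaMemset(), much as
   VecSet_SeqCUDA() above does for alpha == 0.  The helper name is hypothetical.
*/
static PetscErrorCode ExampleZeroOnGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_x;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(v,&d_x);CHKERRQ(ierr);      /* no host-to-device copy is triggered */
  err  = cudaMemset(d_x,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayWrite(v,&d_x);CHKERRQ(ierr);  /* host copy is now marked out of date */
  PetscFunctionReturn(0);
}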

1385: /*@C
1386:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1387:    GPU array provided by the user. This is useful to avoid copying an
1388:    array into a vector.

1390:    Not Collective

1392:    Input Parameters:
1393: +  vec - the vector
1394: -  array - the GPU array

1396:    Notes:
1397:    You can return to the original GPU array with a call to VecCUDAResetArray().
1398:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1399:    same time on the same vector.

1401:    Level: developer

1403: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1405: @*/
1406: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1407: {

1412:   VecCUDACopyToGPU(vin);
1413:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1414:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1415:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1416:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1417:   PetscObjectStateIncrease((PetscObject)vin);
1418:   return(0);
1419: }

1421: /*@C
1422:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1423:    with a GPU array provided by the user. This is useful to avoid copying
1424:    a GPU array into a vector.

1426:    Not Collective

1428:    Input Parameters:
1429: +  vec - the vector
1430: -  array - the GPU array

1432:    Notes:
1433:    This permanently replaces the GPU array and frees the memory associated
1434:    with the old GPU array.

1436:    The memory passed in CANNOT be freed by the user. It will be freed
1437:    when the vector is destroyed.

1439:    Not supported from Fortran

1441:    Level: developer

1443: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1445: @*/
1446: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1447: {
1448:   cudaError_t err;

1453:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1454:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1455:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1456:   PetscObjectStateIncrease((PetscObject)vin);
1457:   return(0);
1458: }

1460: /*@C
1461:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1462:    after the use of VecCUDAPlaceArray().

1464:    Not Collective

1466:    Input Parameters:
1467: .  vec - the vector

1469:    Level: developer

1471: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1473: @*/
1474: PetscErrorCode VecCUDAResetArray(Vec vin)
1475: {

1480:   VecCUDACopyToGPU(vin);
1481:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1482:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1483:   vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1484:   PetscObjectStateIncrease((PetscObject)vin);
1485:   return(0);
1486: }
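
/*
   Editor's note: an illustrative sketch (not part of veccuda2.cu) of the
   VecCUDAPlaceArray()/VecCUDAResetArray() pairing: a user-owned device allocation is
   temporarily substituted for the vector's GPU array, used, and then the original array is
   restored.  The helper name and the work done while the array is placed are hypothetical.
*/
static PetscErrorCode ExamplePlaceAndResetGPUArray(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_user;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  err  = cudaMalloc((void**)&d_user,n*sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemset(d_user,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);  /* v now operates on the user's device array */
  ierr = VecSet(v,2.0);CHKERRQ(ierr);                /* fills d_user on the GPU */
  ierr = VecCUDAResetArray(v);CHKERRQ(ierr);         /* the vector's original GPU array is restored */
  err  = cudaFree(d_user);CHKERRCUDA(err);           /* the placed array remains owned by the caller */
  PetscFunctionReturn(0);
}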