Actual source code: vecviennacl.cxx

petsc-3.9.4 2018-09-11
Report Typos and Errors
  1: /*
  2:    Implements the sequential ViennaCL vectors.
  3: */

  5: #include <petscconf.h>
  6:  #include <petsc/private/vecimpl.h>
  7:  #include <../src/vec/vec/impls/dvecimpl.h>
  8:  #include <../src/vec/vec/impls/seq/seqviennacl/viennaclvecimpl.h>

 10: #include <vector>

 12: #include "viennacl/linalg/inner_prod.hpp"
 13: #include "viennacl/linalg/norm_1.hpp"
 14: #include "viennacl/linalg/norm_2.hpp"
 15: #include "viennacl/linalg/norm_inf.hpp"

 17: #ifdef VIENNACL_WITH_OPENCL
 18: #include "viennacl/ocl/backend.hpp"
 19: #endif


 22: PETSC_EXTERN PetscErrorCode VecViennaCLGetArrayReadWrite(Vec v, ViennaCLVector **a)
 23: {

 28:   *a   = 0;
 29:   VecViennaCLCopyToGPU(v);
 30:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 31:   ViennaCLWaitForGPU();
 32:   return(0);
 33: }

 35: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArrayReadWrite(Vec v, ViennaCLVector **a)
 36: {

 41:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

 43:   PetscObjectStateIncrease((PetscObject)v);
 44:   return(0);
 45: }

 47: PETSC_EXTERN PetscErrorCode VecViennaCLGetArrayRead(Vec v, const ViennaCLVector **a)
 48: {

 53:   *a   = 0;
 54:   VecViennaCLCopyToGPU(v);
 55:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 56:   ViennaCLWaitForGPU();
 57:   return(0);
 58: }

 60: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArrayRead(Vec v, const ViennaCLVector **a)
 61: {
 64:   return(0);
 65: }

 67: PETSC_EXTERN PetscErrorCode VecViennaCLGetArrayWrite(Vec v, ViennaCLVector **a)
 68: {

 73:   *a   = 0;
 74:   VecViennaCLAllocateCheck(v);
 75:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 76:   ViennaCLWaitForGPU();
 77:   return(0);
 78: }

 80: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArrayWrite(Vec v, ViennaCLVector **a)
 81: {

 86:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

 88:   PetscObjectStateIncrease((PetscObject)v);
 89:   return(0);
 90: }



 94: PETSC_EXTERN PetscErrorCode PetscViennaCLInit()
 95: {
 96:   PetscErrorCode       ierr;
 97:   char                 string[20];
 98:   PetscBool            flg,flg_cuda,flg_opencl,flg_openmp;

101:   /* ViennaCL backend selection: CUDA, OpenCL, or OpenMP */
102:   PetscOptionsGetString(NULL,NULL,"-viennacl_backend",string,12,&flg);
103:   if (flg) {
104:     try {
105:       PetscStrcasecmp(string,"cuda",&flg_cuda);
106:       PetscStrcasecmp(string,"opencl",&flg_opencl);
107:       PetscStrcasecmp(string,"openmp",&flg_openmp);

109:       /* A default (sequential) CPU backend is always available - even if OpenMP is not enabled. */
110:       if (flg_openmp) viennacl::backend::default_memory_type(viennacl::MAIN_MEMORY);
111: #if defined(PETSC_HAVE_CUDA)
112:       else if (flg_cuda) viennacl::backend::default_memory_type(viennacl::CUDA_MEMORY);
113: #endif
114: #if defined(PETSC_HAVE_OPENCL)
115:       else if (flg_opencl) viennacl::backend::default_memory_type(viennacl::OPENCL_MEMORY);
116: #endif
117:       else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Backend not recognized or available: %s.\n Pass -viennacl_view to see available backends for ViennaCL.\n", string);
118:     } catch (std::exception const & ex) {
119:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
120:     }
121:   }

123: #if defined(PETSC_HAVE_OPENCL)
124:   /* ViennaCL OpenCL device type configuration */
125:   PetscOptionsGetString(NULL,NULL,"-viennacl_opencl_device_type",string,12,&flg);
126:   if (flg) {
127:     try {
128:       PetscStrcasecmp(string,"cpu",&flg);
129:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_CPU);

131:       PetscStrcasecmp(string,"gpu",&flg);
132:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_GPU);

134:       PetscStrcasecmp(string,"accelerator",&flg);
135:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_ACCELERATOR);
136:     } catch (std::exception const & ex) {
137:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
138:     }
139:   }
140: #endif

142:   /* Print available backends */
143:   PetscOptionsHasName(NULL,NULL,"-viennacl_view",&flg);
144:   if (flg) {
145:     PetscPrintf(PETSC_COMM_WORLD, "ViennaCL backends available: ");
146: #if defined(PETSC_HAVE_CUDA)
147:     PetscPrintf(PETSC_COMM_WORLD, "CUDA, ");
148: #endif
149: #if defined(PETSC_HAVE_OPENCL)
150:     PetscPrintf(PETSC_COMM_WORLD, "OpenCL, ");
151: #endif
152: #if defined(PETSC_HAVE_OPENMP)
153:     PetscPrintf(PETSC_COMM_WORLD, "OpenMP ");
154: #else
155:     PetscPrintf(PETSC_COMM_WORLD, "OpenMP (1 thread) ");
156: #endif
157:     PetscPrintf(PETSC_COMM_WORLD, "\n");

159:     /* Print selected backends */
160:     PetscPrintf(PETSC_COMM_WORLD, "ViennaCL backend  selected: ");
161: #if defined(PETSC_HAVE_CUDA)
162:     if (viennacl::backend::default_memory_type() == viennacl::CUDA_MEMORY) {
163:       PetscPrintf(PETSC_COMM_WORLD, "CUDA ");
164:     }
165: #endif
166: #if defined(PETSC_HAVE_OPENCL)
167:     if (viennacl::backend::default_memory_type() == viennacl::OPENCL_MEMORY) {
168:       PetscPrintf(PETSC_COMM_WORLD, "OpenCL ");
169:     }
170: #endif
171: #if defined(PETSC_HAVE_OPENMP)
172:     if (viennacl::backend::default_memory_type() == viennacl::MAIN_MEMORY) {
173:       PetscPrintf(PETSC_COMM_WORLD, "OpenMP ");
174:     }
175: #else
176:     if (viennacl::backend::default_memory_type() == viennacl::MAIN_MEMORY) {
177:       PetscPrintf(PETSC_COMM_WORLD, "OpenMP (sequential - consider reconfiguration: --with-openmp=1) ");
178:     }
179: #endif
180:     PetscPrintf(PETSC_COMM_WORLD, "\n");
181:   }
182:   return(0);
183: }

185: /*
186:     Allocates space for the vector array on the Host if it does not exist.
187:     Does NOT change the PetscViennaCLFlag for the vector
188:     Does NOT zero the ViennaCL array
189:  */
190: PETSC_EXTERN PetscErrorCode VecViennaCLAllocateCheckHost(Vec v)
191: {
193:   PetscScalar    *array;
194:   Vec_Seq        *s;
195:   PetscInt       n = v->map->n;

198:   s    = (Vec_Seq*)v->data;
199:   VecViennaCLAllocateCheck(v);
200:   if (s->array == 0) {
201:     PetscMalloc1(n,&array);
202:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
203:     s->array           = array;
204:     s->array_allocated = array;
205:   }
206:   return(0);
207: }


210: /*
211:     Allocates space for the vector array on the GPU if it does not exist.
212:     Does NOT change the PetscViennaCLFlag for the vector
213:     Does NOT zero the ViennaCL array

215:  */
216: PetscErrorCode VecViennaCLAllocateCheck(Vec v)
217: {
219:   int            rank;

222:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
223:   // First allocate memory on the GPU if needed
224:   if (!v->spptr) {
225:     try {
226:       v->spptr                            = new Vec_ViennaCL;
227:       ((Vec_ViennaCL*)v->spptr)->GPUarray = new ViennaCLVector((PetscBLASInt)v->map->n);

229:     } catch(std::exception const & ex) {
230:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
231:     }
232:   }
233:   return(0);
234: }


237: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
238: PetscErrorCode VecViennaCLCopyToGPU(Vec v)
239: {

244:   VecViennaCLAllocateCheck(v);
245:   if (v->map->n > 0) {
246:     if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
247:       PetscLogEventBegin(VEC_ViennaCLCopyToGPU,v,0,0,0);
248:       try {
249:         ViennaCLVector *vec = ((Vec_ViennaCL*)v->spptr)->GPUarray;
250:         viennacl::fast_copy(*(PetscScalar**)v->data, *(PetscScalar**)v->data + v->map->n, vec->begin());
251:         ViennaCLWaitForGPU();
252:       } catch(std::exception const & ex) {
253:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
254:       }
255:       PetscLogEventEnd(VEC_ViennaCLCopyToGPU,v,0,0,0);
256:       v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
257:     }
258:   }
259:   return(0);
260: }



264: /*
265:      VecViennaCLCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
266: */
267: PetscErrorCode VecViennaCLCopyFromGPU(Vec v)
268: {

273:   VecViennaCLAllocateCheckHost(v);
274:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
275:     PetscLogEventBegin(VEC_ViennaCLCopyFromGPU,v,0,0,0);
276:     try {
277:       ViennaCLVector *vec = ((Vec_ViennaCL*)v->spptr)->GPUarray;
278:       viennacl::fast_copy(vec->begin(),vec->end(),*(PetscScalar**)v->data);
279:       ViennaCLWaitForGPU();
280:     } catch(std::exception const & ex) {
281:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
282:     }
283:     PetscLogEventEnd(VEC_ViennaCLCopyFromGPU,v,0,0,0);
284:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
285:   }
286:   return(0);
287: }


290: /* Copy on CPU */
291: static PetscErrorCode VecCopy_SeqViennaCL_Private(Vec xin,Vec yin)
292: {
293:   PetscScalar       *ya;
294:   const PetscScalar *xa;
295:   PetscErrorCode    ierr;

298:   VecViennaCLAllocateCheckHost(xin);
299:   VecViennaCLAllocateCheckHost(yin);
300:   if (xin != yin) {
301:     VecGetArrayRead(xin,&xa);
302:     VecGetArray(yin,&ya);
303:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
304:     VecRestoreArrayRead(xin,&xa);
305:     VecRestoreArray(yin,&ya);
306:   }
307:   return(0);
308: }

310: static PetscErrorCode VecSetRandom_SeqViennaCL_Private(Vec xin,PetscRandom r)
311: {
313:   PetscInt       n = xin->map->n,i;
314:   PetscScalar    *xx;

317:   VecGetArray(xin,&xx);
318:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
319:   VecRestoreArray(xin,&xx);
320:   return(0);
321: }

323: static PetscErrorCode VecDestroy_SeqViennaCL_Private(Vec v)
324: {
325:   Vec_Seq        *vs = (Vec_Seq*)v->data;

329:   PetscObjectSAWsViewOff(v);
330: #if defined(PETSC_USE_LOG)
331:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
332: #endif
333:   if (vs->array_allocated) PetscFree(vs->array_allocated);
334:   PetscFree(vs);
335:   return(0);
336: }

338: static PetscErrorCode VecResetArray_SeqViennaCL_Private(Vec vin)
339: {
340:   Vec_Seq *v = (Vec_Seq*)vin->data;

343:   v->array         = v->unplacedarray;
344:   v->unplacedarray = 0;
345:   return(0);
346: }


349: /*MC
350:    VECSEQVIENNACL - VECSEQVIENNACL = "seqviennacl" - The basic sequential vector, modified to use ViennaCL

352:    Options Database Keys:
353: . -vec_type seqviennacl - sets the vector type to VECSEQVIENNACL during a call to VecSetFromOptions()

355:   Level: beginner

357: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
358: M*/


361: PetscErrorCode VecAYPX_SeqViennaCL(Vec yin, PetscScalar alpha, Vec xin)
362: {
363:   const ViennaCLVector  *xgpu;
364:   ViennaCLVector        *ygpu;
365:   PetscErrorCode        ierr;

368:   VecViennaCLGetArrayRead(xin,&xgpu);
369:   VecViennaCLGetArrayReadWrite(yin,&ygpu);
370:   try {
371:     if (alpha != 0.0 && xin->map->n > 0) {
372:       *ygpu = *xgpu + alpha * *ygpu;
373:       PetscLogFlops(2.0*yin->map->n);
374:     } else {
375:       *ygpu = *xgpu;
376:     }
377:     ViennaCLWaitForGPU();
378:   } catch(std::exception const & ex) {
379:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
380:   }
381:   VecViennaCLRestoreArrayRead(xin,&xgpu);
382:   VecViennaCLRestoreArrayReadWrite(yin,&ygpu);
383:   return(0);
384: }


387: PetscErrorCode VecAXPY_SeqViennaCL(Vec yin,PetscScalar alpha,Vec xin)
388: {
389:   const ViennaCLVector  *xgpu;
390:   ViennaCLVector        *ygpu;
391:   PetscErrorCode        ierr;

394:   if (alpha != 0.0 && xin->map->n > 0) {
395:     VecViennaCLGetArrayRead(xin,&xgpu);
396:     VecViennaCLGetArrayReadWrite(yin,&ygpu);
397:     try {
398:       *ygpu += alpha * *xgpu;
399:       ViennaCLWaitForGPU();
400:     } catch(std::exception const & ex) {
401:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
402:     }
403:     VecViennaCLRestoreArrayRead(xin,&xgpu);
404:     VecViennaCLRestoreArrayReadWrite(yin,&ygpu);
405:     PetscLogFlops(2.0*yin->map->n);
406:   }
407:   return(0);
408: }


411: PetscErrorCode VecPointwiseDivide_SeqViennaCL(Vec win, Vec xin, Vec yin)
412: {
413:   const ViennaCLVector  *xgpu,*ygpu;
414:   ViennaCLVector        *wgpu;
415:   PetscErrorCode        ierr;

418:   if (xin->map->n > 0) {
419:     VecViennaCLGetArrayRead(xin,&xgpu);
420:     VecViennaCLGetArrayRead(yin,&ygpu);
421:     VecViennaCLGetArrayWrite(win,&wgpu);
422:     try {
423:       *wgpu = viennacl::linalg::element_div(*xgpu, *ygpu);
424:       ViennaCLWaitForGPU();
425:     } catch(std::exception const & ex) {
426:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
427:     }
428:     PetscLogFlops(win->map->n);
429:     VecViennaCLRestoreArrayRead(xin,&xgpu);
430:     VecViennaCLRestoreArrayRead(yin,&ygpu);
431:     VecViennaCLRestoreArrayWrite(win,&wgpu);
432:   }
433:   return(0);
434: }


437: PetscErrorCode VecWAXPY_SeqViennaCL(Vec win,PetscScalar alpha,Vec xin, Vec yin)
438: {
439:   const ViennaCLVector  *xgpu,*ygpu;
440:   ViennaCLVector        *wgpu;
441:   PetscErrorCode        ierr;

444:   if (alpha == 0.0 && xin->map->n > 0) {
445:     VecCopy_SeqViennaCL(yin,win);
446:   } else {
447:     VecViennaCLGetArrayRead(xin,&xgpu);
448:     VecViennaCLGetArrayRead(yin,&ygpu);
449:     VecViennaCLGetArrayWrite(win,&wgpu);
450:     if (alpha == 1.0) {
451:       try {
452:         *wgpu = *ygpu + *xgpu;
453:       } catch(std::exception const & ex) {
454:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
455:       }
456:       PetscLogFlops(win->map->n);
457:     } else if (alpha == -1.0) {
458:       try {
459:         *wgpu = *ygpu - *xgpu;
460:       } catch(std::exception const & ex) {
461:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
462:       }
463:       PetscLogFlops(win->map->n);
464:     } else {
465:       try {
466:         *wgpu = *ygpu + alpha * *xgpu;
467:       } catch(std::exception const & ex) {
468:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
469:       }
470:       PetscLogFlops(2*win->map->n);
471:     }
472:     ViennaCLWaitForGPU();
473:     VecViennaCLRestoreArrayRead(xin,&xgpu);
474:     VecViennaCLRestoreArrayRead(yin,&ygpu);
475:     VecViennaCLRestoreArrayWrite(win,&wgpu);
476:   }
477:   return(0);
478: }


481: /*
482:  * Operation x = x + sum_i alpha_i * y_i for vectors x, y_i and scalars alpha_i
483:  *
484:  * ViennaCL supports a fast evaluation of x += alpha * y and x += alpha * y + beta * z,
485:  * hence there is an iterated application of these until the final result is obtained
486:  */
487: PetscErrorCode VecMAXPY_SeqViennaCL(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
488: {
490:   PetscInt       j;

493:   for (j = 0; j < nv; ++j) {
494:     if (j+1 < nv) {
495:       VecAXPBYPCZ_SeqViennaCL(xin,alpha[j],alpha[j+1],1.0,y[j],y[j+1]);
496:       ++j;
497:     } else {
498:       VecAXPY_SeqViennaCL(xin,alpha[j],y[j]);
499:     }
500:   }
501:   ViennaCLWaitForGPU();
502:   return(0);
503: }


506: PetscErrorCode VecDot_SeqViennaCL(Vec xin,Vec yin,PetscScalar *z)
507: {
508:   const ViennaCLVector  *xgpu,*ygpu;
509:   PetscErrorCode        ierr;

512:   if (xin->map->n > 0) {
513:     VecViennaCLGetArrayRead(xin,&xgpu);
514:     VecViennaCLGetArrayRead(yin,&ygpu);
515:     try {
516:       *z = viennacl::linalg::inner_prod(*xgpu,*ygpu);
517:     } catch(std::exception const & ex) {
518:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
519:     }
520:     if (xin->map->n >0) {
521:       PetscLogFlops(2.0*xin->map->n-1);
522:     }
523:     ViennaCLWaitForGPU();
524:     VecViennaCLRestoreArrayRead(xin,&xgpu);
525:     VecViennaCLRestoreArrayRead(yin,&ygpu);
526:   } else *z = 0.0;
527:   return(0);
528: }



532: /*
533:  * Operation z[j] = dot(x, y[j])
534:  *
535:  * We use an iterated application of dot() for each j. For small ranges of j this is still faster than an allocation of extra memory in order to use gemv().
536:  */
537: PetscErrorCode VecMDot_SeqViennaCL(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
538: {
539:   PetscErrorCode       ierr;
540:   PetscInt             n = xin->map->n,i;
541:   const ViennaCLVector *xgpu,*ygpu;
542:   Vec                  *yyin = (Vec*)yin;
543:   std::vector<viennacl::vector_base<PetscScalar> const *> ygpu_array(nv);

546:   if (xin->map->n > 0) {
547:     VecViennaCLGetArrayRead(xin,&xgpu);
548:     for (i=0; i<nv; i++) {
549:       VecViennaCLGetArrayRead(yyin[i],&ygpu);
550:       ygpu_array[i] = ygpu;
551:     }

553:     viennacl::vector_tuple<PetscScalar> y_tuple(ygpu_array);
554:     ViennaCLVector result = viennacl::linalg::inner_prod(*xgpu, y_tuple);

556:     for (i=0; i<nv; i++) {
557:       viennacl::copy(result.begin(), result.end(), z);
558:       VecViennaCLRestoreArrayRead(yyin[i],&ygpu);
559:     }

561:     ViennaCLWaitForGPU();
562:     VecViennaCLRestoreArrayRead(xin,&xgpu);
563:     PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
564:   } else {
565:     for (i=0; i<nv; i++) z[i] = 0.0;
566:   }
567:   return(0);
568: }

570: PetscErrorCode VecMTDot_SeqViennaCL(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
571: {

575:   /* Since complex case is not supported at the moment, this is the same as VecMDot_SeqViennaCL */
576:   VecMDot_SeqViennaCL(xin,nv,yin,z);
577:   ViennaCLWaitForGPU();
578:   return(0);
579: }


582: PetscErrorCode VecSet_SeqViennaCL(Vec xin,PetscScalar alpha)
583: {
584:   ViennaCLVector *xgpu;

588:   if (xin->map->n > 0) {
589:     VecViennaCLGetArrayWrite(xin,&xgpu);
590:     try {
591:       *xgpu = viennacl::scalar_vector<PetscScalar>(xgpu->size(), alpha);
592:       ViennaCLWaitForGPU();
593:     } catch(std::exception const & ex) {
594:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
595:     }
596:     VecViennaCLRestoreArrayWrite(xin,&xgpu);
597:   }
598:   return(0);
599: }

601: PetscErrorCode VecScale_SeqViennaCL(Vec xin, PetscScalar alpha)
602: {
603:   ViennaCLVector *xgpu;

607:   if (alpha == 0.0 && xin->map->n > 0) {
608:     VecSet_SeqViennaCL(xin,alpha);
609:     PetscLogFlops(xin->map->n);
610:   } else if (alpha != 1.0 && xin->map->n > 0) {
611:     VecViennaCLGetArrayReadWrite(xin,&xgpu);
612:     try {
613:       *xgpu *= alpha;
614:       ViennaCLWaitForGPU();
615:     } catch(std::exception const & ex) {
616:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
617:     }
618:     VecViennaCLRestoreArrayReadWrite(xin,&xgpu);
619:     PetscLogFlops(xin->map->n);
620:   }
621:   return(0);
622: }


625: PetscErrorCode VecTDot_SeqViennaCL(Vec xin,Vec yin,PetscScalar *z)
626: {

630:   /* Since complex case is not supported at the moment, this is the same as VecDot_SeqViennaCL */
631:   VecDot_SeqViennaCL(xin, yin, z);
632:   ViennaCLWaitForGPU();
633:   return(0);
634: }


637: PetscErrorCode VecCopy_SeqViennaCL(Vec xin,Vec yin)
638: {
639:   const ViennaCLVector *xgpu;
640:   ViennaCLVector       *ygpu;
641:   PetscErrorCode       ierr;

644:   if (xin != yin && xin->map->n > 0) {
645:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
646:       VecViennaCLGetArrayRead(xin,&xgpu);
647:       VecViennaCLGetArrayWrite(yin,&ygpu);
648:       try {
649:         *ygpu = *xgpu;
650:         ViennaCLWaitForGPU();
651:       } catch(std::exception const & ex) {
652:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
653:       }
654:       VecViennaCLRestoreArrayRead(xin,&xgpu);
655:       VecViennaCLRestoreArrayWrite(yin,&ygpu);

657:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
658:       /* copy in CPU if we are on the CPU*/
659:       VecCopy_SeqViennaCL_Private(xin,yin);
660:       ViennaCLWaitForGPU();
661:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
662:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
663:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
664:         /* copy in CPU */
665:         VecCopy_SeqViennaCL_Private(xin,yin);
666:         ViennaCLWaitForGPU();
667:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
668:         /* copy in GPU */
669:         VecViennaCLGetArrayRead(xin,&xgpu);
670:         VecViennaCLGetArrayWrite(yin,&ygpu);
671:         try {
672:           *ygpu = *xgpu;
673:           ViennaCLWaitForGPU();
674:         } catch(std::exception const & ex) {
675:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
676:         }
677:         VecViennaCLRestoreArrayRead(xin,&xgpu);
678:         VecViennaCLRestoreArrayWrite(yin,&ygpu);
679:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
680:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
681:            default to copy in GPU (this is an arbitrary choice) */
682:         VecViennaCLGetArrayRead(xin,&xgpu);
683:         VecViennaCLGetArrayWrite(yin,&ygpu);
684:         try {
685:           *ygpu = *xgpu;
686:           ViennaCLWaitForGPU();
687:         } catch(std::exception const & ex) {
688:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
689:         }
690:         VecViennaCLRestoreArrayRead(xin,&xgpu);
691:         VecViennaCLRestoreArrayWrite(yin,&ygpu);
692:       } else {
693:         VecCopy_SeqViennaCL_Private(xin,yin);
694:         ViennaCLWaitForGPU();
695:       }
696:     }
697:   }
698:   return(0);
699: }


702: PetscErrorCode VecSwap_SeqViennaCL(Vec xin,Vec yin)
703: {
705:   ViennaCLVector *xgpu,*ygpu;

708:   if (xin != yin && xin->map->n > 0) {
709:     VecViennaCLGetArrayReadWrite(xin,&xgpu);
710:     VecViennaCLGetArrayReadWrite(yin,&ygpu);

712:     try {
713:       viennacl::swap(*xgpu, *ygpu);
714:       ViennaCLWaitForGPU();
715:     } catch(std::exception const & ex) {
716:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
717:     }
718:     VecViennaCLRestoreArrayReadWrite(xin,&xgpu);
719:     VecViennaCLRestoreArrayReadWrite(yin,&ygpu);
720:   }
721:   return(0);
722: }


725: // y = alpha * x + beta * y
726: PetscErrorCode VecAXPBY_SeqViennaCL(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
727: {
728:   PetscErrorCode       ierr;
729:   PetscScalar          a = alpha,b = beta;
730:   const ViennaCLVector *xgpu;
731:   ViennaCLVector       *ygpu;

734:   if (a == 0.0 && xin->map->n > 0) {
735:     VecScale_SeqViennaCL(yin,beta);
736:   } else if (b == 1.0 && xin->map->n > 0) {
737:     VecAXPY_SeqViennaCL(yin,alpha,xin);
738:   } else if (a == 1.0 && xin->map->n > 0) {
739:     VecAYPX_SeqViennaCL(yin,beta,xin);
740:   } else if (b == 0.0 && xin->map->n > 0) {
741:     VecViennaCLGetArrayRead(xin,&xgpu);
742:     VecViennaCLGetArrayReadWrite(yin,&ygpu);
743:     try {
744:       *ygpu = *xgpu * alpha;
745:       ViennaCLWaitForGPU();
746:     } catch(std::exception const & ex) {
747:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
748:     }
749:     PetscLogFlops(xin->map->n);
750:     VecViennaCLRestoreArrayRead(xin,&xgpu);
751:     VecViennaCLRestoreArrayReadWrite(yin,&ygpu);
752:   } else if (xin->map->n > 0) {
753:     VecViennaCLGetArrayRead(xin,&xgpu);
754:     VecViennaCLGetArrayReadWrite(yin,&ygpu);
755:     try {
756:       *ygpu = *xgpu * alpha + *ygpu * beta;
757:       ViennaCLWaitForGPU();
758:     } catch(std::exception const & ex) {
759:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
760:     }
761:     VecViennaCLRestoreArrayRead(xin,&xgpu);
762:     VecViennaCLRestoreArrayReadWrite(yin,&ygpu);
763:     PetscLogFlops(3.0*xin->map->n);
764:   }
765:   return(0);
766: }


769: /* operation  z = alpha * x + beta *y + gamma *z*/
770: PetscErrorCode VecAXPBYPCZ_SeqViennaCL(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
771: {
772:   PetscErrorCode       ierr;
773:   PetscInt             n = zin->map->n;
774:   const ViennaCLVector *xgpu,*ygpu;
775:   ViennaCLVector       *zgpu;

778:   VecViennaCLGetArrayRead(xin,&xgpu);
779:   VecViennaCLGetArrayRead(yin,&ygpu);
780:   VecViennaCLGetArrayReadWrite(zin,&zgpu);
781:   if (alpha == 0.0 && xin->map->n > 0) {
782:     try {
783:       if (beta == 0.0) {
784:         *zgpu = gamma * *zgpu;
785:         ViennaCLWaitForGPU();
786:         PetscLogFlops(1.0*n);
787:       } else if (gamma == 0.0) {
788:         *zgpu = beta * *ygpu;
789:         ViennaCLWaitForGPU();
790:         PetscLogFlops(1.0*n);
791:       } else {
792:         *zgpu = beta * *ygpu + gamma * *zgpu;
793:         ViennaCLWaitForGPU();
794:         PetscLogFlops(3.0*n);
795:       }
796:     } catch(std::exception const & ex) {
797:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
798:     }
799:     PetscLogFlops(3.0*n);
800:   } else if (beta == 0.0 && xin->map->n > 0) {
801:     try {
802:       if (gamma == 0.0) {
803:         *zgpu = alpha * *xgpu;
804:         ViennaCLWaitForGPU();
805:         PetscLogFlops(1.0*n);
806:       } else {
807:         *zgpu = alpha * *xgpu + gamma * *zgpu;
808:         ViennaCLWaitForGPU();
809:         PetscLogFlops(3.0*n);
810:       }
811:     } catch(std::exception const & ex) {
812:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
813:     }
814:   } else if (gamma == 0.0 && xin->map->n > 0) {
815:     try {
816:       *zgpu = alpha * *xgpu + beta * *ygpu;
817:       ViennaCLWaitForGPU();
818:     } catch(std::exception const & ex) {
819:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
820:     }
821:     PetscLogFlops(3.0*n);
822:   } else if (xin->map->n > 0) {
823:     try {
824:       /* Split operation into two steps. This is not completely ideal, but avoids temporaries (which are far worse) */
825:       if (gamma != 1.0)
826:         *zgpu *= gamma;
827:       *zgpu += alpha * *xgpu + beta * *ygpu;
828:       ViennaCLWaitForGPU();
829:     } catch(std::exception const & ex) {
830:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
831:     }
832:     VecViennaCLRestoreArrayReadWrite(zin,&zgpu);
833:     VecViennaCLRestoreArrayRead(xin,&xgpu);
834:     VecViennaCLRestoreArrayRead(yin,&ygpu);
835:     PetscLogFlops(5.0*n);
836:   }
837:   return(0);
838: }

840: PetscErrorCode VecPointwiseMult_SeqViennaCL(Vec win,Vec xin,Vec yin)
841: {
842:   PetscErrorCode       ierr;
843:   PetscInt             n = win->map->n;
844:   const ViennaCLVector *xgpu,*ygpu;
845:   ViennaCLVector       *wgpu;

848:   if (xin->map->n > 0) {
849:     VecViennaCLGetArrayRead(xin,&xgpu);
850:     VecViennaCLGetArrayRead(yin,&ygpu);
851:     VecViennaCLGetArrayReadWrite(win,&wgpu);
852:     try {
853:       *wgpu = viennacl::linalg::element_prod(*xgpu, *ygpu);
854:       ViennaCLWaitForGPU();
855:     } catch(std::exception const & ex) {
856:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
857:     }
858:     VecViennaCLRestoreArrayRead(xin,&xgpu);
859:     VecViennaCLRestoreArrayRead(yin,&ygpu);
860:     VecViennaCLRestoreArrayReadWrite(win,&wgpu);
861:     PetscLogFlops(n);
862:   }
863:   return(0);
864: }


867: PetscErrorCode VecNorm_SeqViennaCL(Vec xin,NormType type,PetscReal *z)
868: {
869:   PetscErrorCode       ierr;
870:   PetscInt             n = xin->map->n;
871:   PetscBLASInt         bn;
872:   const ViennaCLVector *xgpu;

875:   if (xin->map->n > 0) {
876:     PetscBLASIntCast(n,&bn);
877:     VecViennaCLGetArrayRead(xin,&xgpu);
878:     if (type == NORM_2 || type == NORM_FROBENIUS) {
879:       try {
880:         *z = viennacl::linalg::norm_2(*xgpu);
881:         ViennaCLWaitForGPU();
882:       } catch(std::exception const & ex) {
883:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
884:       }
885:       PetscLogFlops(PetscMax(2.0*n-1,0.0));
886:     } else if (type == NORM_INFINITY) {
887:       VecViennaCLGetArrayRead(xin,&xgpu);
888:       try {
889:         *z = viennacl::linalg::norm_inf(*xgpu);
890:         ViennaCLWaitForGPU();
891:       } catch(std::exception const & ex) {
892:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
893:       }
894:       VecViennaCLRestoreArrayRead(xin,&xgpu);
895:     } else if (type == NORM_1) {
896:       try {
897:         *z = viennacl::linalg::norm_1(*xgpu);
898:         ViennaCLWaitForGPU();
899:       } catch(std::exception const & ex) {
900:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
901:       }
902:       PetscLogFlops(PetscMax(n-1.0,0.0));
903:     } else if (type == NORM_1_AND_2) {
904:       try {
905:         *z     = viennacl::linalg::norm_1(*xgpu);
906:         *(z+1) = viennacl::linalg::norm_2(*xgpu);
907:         ViennaCLWaitForGPU();
908:       } catch(std::exception const & ex) {
909:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
910:       }
911:       PetscLogFlops(PetscMax(2.0*n-1,0.0));
912:       PetscLogFlops(PetscMax(n-1.0,0.0));
913:     }
914:     VecViennaCLRestoreArrayRead(xin,&xgpu);
915:   } else if (type == NORM_1_AND_2) {
916:     *z      = 0.0;
917:     *(z+1)  = 0.0;
918:   } else *z = 0.0;
919:   return(0);
920: }


923: PetscErrorCode VecSetRandom_SeqViennaCL(Vec xin,PetscRandom r)
924: {

928:   VecSetRandom_SeqViennaCL_Private(xin,r);
929:   xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
930:   return(0);
931: }

933: PetscErrorCode VecResetArray_SeqViennaCL(Vec vin)
934: {

939:   VecViennaCLCopyFromGPU(vin);
940:   VecResetArray_SeqViennaCL_Private(vin);
941:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
942:   return(0);
943: }

945: PetscErrorCode VecPlaceArray_SeqViennaCL(Vec vin,const PetscScalar *a)
946: {

951:   VecViennaCLCopyFromGPU(vin);
952:   VecPlaceArray_Seq(vin,a);
953:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
954:   return(0);
955: }


958: PetscErrorCode VecReplaceArray_SeqViennaCL(Vec vin,const PetscScalar *a)
959: {

964:   VecViennaCLCopyFromGPU(vin);
965:   VecReplaceArray_Seq(vin,a);
966:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
967:   return(0);
968: }


971: /*@
972:    VecCreateSeqViennaCL - Creates a standard, sequential array-style vector.

974:    Collective on MPI_Comm

976:    Input Parameter:
977: +  comm - the communicator, should be PETSC_COMM_SELF
978: -  n - the vector length

980:    Output Parameter:
981: .  V - the vector

983:    Notes:
984:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
985:    same type as an existing vector.

987:    Level: intermediate

989:    Concepts: vectors^creating sequential

991: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
992: @*/
993: PetscErrorCode  VecCreateSeqViennaCL(MPI_Comm comm,PetscInt n,Vec *v)
994: {

998:   VecCreate(comm,v);
999:   VecSetSizes(*v,n,n);
1000:   VecSetType(*v,VECSEQVIENNACL);
1001:   return(0);
1002: }


1005: /*  VecDotNorm2 - computes the inner product of two vectors and the 2-norm squared of the second vector
1006:  *
1007:  *  Simply reuses VecDot() and VecNorm(). Performance improvement through custom kernel (kernel generator) possible.
1008:  */
1009: PetscErrorCode VecDotNorm2_SeqViennaCL(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1010: {
1011:   PetscErrorCode                         ierr;

1014:   VecDot_SeqViennaCL(s,t,dp);
1015:   VecNorm_SeqViennaCL(t,NORM_2,nm);
1016:   *nm *= *nm; //squared norm required
1017:   return(0);
1018: }

1020: PetscErrorCode VecDuplicate_SeqViennaCL(Vec win,Vec *V)
1021: {

1025:   VecCreateSeqViennaCL(PetscObjectComm((PetscObject)win),win->map->n,V);
1026:   PetscLayoutReference(win->map,&(*V)->map);
1027:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1028:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1029:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1030:   return(0);
1031: }

1033: PetscErrorCode VecDestroy_SeqViennaCL(Vec v)
1034: {

1038:   try {
1039:     if (v->spptr) {
1040:       delete ((Vec_ViennaCL*)v->spptr)->GPUarray;
1041:       delete (Vec_ViennaCL*) v->spptr;
1042:     }
1043:   } catch(char *ex) {
1044:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
1045:   }
1046:   VecDestroy_SeqViennaCL_Private(v);
1047:   return(0);
1048: }


1051: PETSC_EXTERN PetscErrorCode VecCreate_SeqViennaCL(Vec V)
1052: {
1054:   PetscMPIInt    size;

1057:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1058:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQVIENNACL on more than one process");
1059:   VecCreate_Seq_Private(V,0);
1060:   PetscObjectChangeTypeName((PetscObject)V,VECSEQVIENNACL);

1062:   V->ops->dot             = VecDot_SeqViennaCL;
1063:   V->ops->norm            = VecNorm_SeqViennaCL;
1064:   V->ops->tdot            = VecTDot_SeqViennaCL;
1065:   V->ops->scale           = VecScale_SeqViennaCL;
1066:   V->ops->copy            = VecCopy_SeqViennaCL;
1067:   V->ops->set             = VecSet_SeqViennaCL;
1068:   V->ops->swap            = VecSwap_SeqViennaCL;
1069:   V->ops->axpy            = VecAXPY_SeqViennaCL;
1070:   V->ops->axpby           = VecAXPBY_SeqViennaCL;
1071:   V->ops->axpbypcz        = VecAXPBYPCZ_SeqViennaCL;
1072:   V->ops->pointwisemult   = VecPointwiseMult_SeqViennaCL;
1073:   V->ops->pointwisedivide = VecPointwiseDivide_SeqViennaCL;
1074:   V->ops->setrandom       = VecSetRandom_SeqViennaCL;
1075:   V->ops->dot_local       = VecDot_SeqViennaCL;
1076:   V->ops->tdot_local      = VecTDot_SeqViennaCL;
1077:   V->ops->norm_local      = VecNorm_SeqViennaCL;
1078:   V->ops->mdot_local      = VecMDot_SeqViennaCL;
1079:   V->ops->mtdot_local     = VecMTDot_SeqViennaCL;
1080:   V->ops->maxpy           = VecMAXPY_SeqViennaCL;
1081:   V->ops->mdot            = VecMDot_SeqViennaCL;
1082:   V->ops->mtdot           = VecMTDot_SeqViennaCL;
1083:   V->ops->aypx            = VecAYPX_SeqViennaCL;
1084:   V->ops->waxpy           = VecWAXPY_SeqViennaCL;
1085:   V->ops->dotnorm2        = VecDotNorm2_SeqViennaCL;
1086:   V->ops->placearray      = VecPlaceArray_SeqViennaCL;
1087:   V->ops->replacearray    = VecReplaceArray_SeqViennaCL;
1088:   V->ops->resetarray      = VecResetArray_SeqViennaCL;
1089:   V->ops->destroy         = VecDestroy_SeqViennaCL;
1090:   V->ops->duplicate       = VecDuplicate_SeqViennaCL;

1092:   VecViennaCLAllocateCheck(V);
1093:   V->valid_GPU_array      = PETSC_OFFLOAD_GPU;
1094:   VecSet(V,0.0);
1095:   return(0);
1096: }