Actual source code: mathara.cu

  1: #include <hara.h>
  2: #include <distributed/distributed_hara_handle.h>
  3: #include <distributed/distributed_hmatrix.h>
  4: #include <distributed/distributed_geometric_construction.h>
  5: #include <distributed/distributed_hgemv.h>
  6: #include <petsc/private/matimpl.h>
  7: #include <petscsf.h>

  9: #define MatHaraGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 11: // TODO HARA:
 12: // MatDuplicate buggy with commsize > 1
 13: // kernel needs (global?) id of points (issues with Chebyshev points and coupling matrix computation)
 14: // unsymmetrix DistributedHMatrix (transpose for distributed_hgemv?)
 15: // Unify interface for sequential and parallel?
 16: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 17: // Fix includes:
 18: // - everything under hara/ dir (except for hara.h)
 19: // - fix kblas includes
 20: // - namespaces?
 21: // Fix naming convention DistributedHMatrix_GPU vs GPU_HMatrix
 22: // Diagnostics? FLOPS, MEMORY USAGE IN PARALLEL
 23: // Why do we need to template the admissibility condition in the hmatrix construction?
 24: //
 25: template<typename T, int D>
 26: struct PetscPointCloud
 27: {
 28:   static const int Dim = D;
 29:   typedef T ElementType;

 31:   struct Point
 32:   {
 33:     T x[D];
 34:     Point() {
 35:       for (int i = 0; i < D; i++) x[i] = 0;
 36:     }
 37:     Point(const T xx[]) {
 38:       for (int i = 0; i < D; i++) x[i] = xx[i];
 39:     }
 40:   };

 42:   std::vector<Point> pts;

 44:   // Must return the number of data points
 45:   inline size_t kdtree_get_point_count() const { return pts.size(); }

 47:   // Returns the dim'th component of the idx'th point in the class:
 48:   inline T kdtree_get_pt(const size_t idx, int dim) const
 49:   {
 50:     return pts[idx].x[dim];
 51:   }

 53:   inline T get_pt(const size_t idx, int dim) const
 54:   {
 55:     return kdtree_get_pt(idx, dim);
 56:   }

 58:   inline bool kdtree_ignore_point(const size_t idx) const { return false; }

 60:   // Optional bounding-box computation: return false to default to a standard bbox computation loop.
 61:   //   Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again.
 62:   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds)
 63:   template <class BBOX>
 64:   bool kdtree_get_bbox(BBOX& /* bb */) const { return false; }

 66:   PetscPointCloud(PetscInt,const T[]);
 67:   PetscPointCloud(PetscPointCloud&);
 68: };

 70: template<typename T, int D>
 71: PetscPointCloud<T,D>::PetscPointCloud(PetscInt n, const T coords[])
 72: {
 73:   this->pts.resize(n);
 74:   for (PetscInt i = 0; i < n; i++) {
 75:     Point p(coords);
 76:     this->pts[i] = p;
 77:     coords += D;
 78:   }
 79: }

 81: template<typename T, int D>
 82: PetscPointCloud<T,D>::PetscPointCloud(PetscPointCloud<T,D>& other)
 83: {
 84:   this->pts = other.pts;
 85: }

 87: template <typename T>
 88: using PetscPointCloud3D = PetscPointCloud<T,3>;
 89: template <typename T>
 90: using PetscPointCloud2D = PetscPointCloud<T,2>;
 91: template <typename T>
 92: using PetscPointCloud1D = PetscPointCloud<T,1>;

 94: template<typename T, int Dim>
 95: struct PetscFunctionGenerator
 96: {
 97: private:
 98:   MatHaraKernel k;
 99:   void          *ctx;

101: public:
102:     PetscFunctionGenerator(MatHaraKernel k, void* ctx) { this->k = k; this->ctx = ctx; }
103:     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->ctx = other.ctx; }
104:     T operator()(PetscReal pt1[Dim], PetscReal pt2[Dim])
105:     {
106:         return (T)(this->k ? (*this->k)(Dim,pt1,pt2,this->ctx) : 0);
107:     }
108: };

110: template <typename T>
111: using PetscFunctionGenerator3D = PetscFunctionGenerator<T,3>;
112: template <typename T>
113: using PetscFunctionGenerator2D = PetscFunctionGenerator<T,2>;
114: template <typename T>
115: using PetscFunctionGenerator1D = PetscFunctionGenerator<T,1>;

117: #include <../src/mat/impls/hara/matharasampler.hpp>

119: typedef struct {
120:   distributedHaraHandle_t handle;

122:   /* two different classes at the moment */
123:   HMatrix *hmatrix;
124:   DistributedHMatrix *dist_hmatrix;

126:   /* May use permutations */
127:   PetscSF sf;
128:   PetscLayout hara_rmap;
129:   thrust::host_vector<PetscScalar> *xx,*yy;
130:   PetscInt xxs,yys;
131:   PetscBool multsetup;

133:   /* GPU */
134: #if defined(HARA_USE_GPU)
135:   GPU_HMatrix *hmatrix_gpu;
136:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
137:   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
138:   PetscInt xxs_gpu,yys_gpu;
139: #endif

141:   /* construction from matvecs */
142:   PetscMatrixSampler* sampler;

144:   /* Admissibility */
145:   PetscReal eta;
146:   PetscInt  leafsize;

148:   /* for dof reordering */
149:   PetscInt spacedim;
150:   PetscPointCloud1D<PetscReal> *ptcloud1;
151:   PetscPointCloud2D<PetscReal> *ptcloud2;
152:   PetscPointCloud3D<PetscReal> *ptcloud3;

154:   /* kernel for generating matrix entries */
155:   PetscFunctionGenerator1D<PetscScalar> *kernel1;
156:   PetscFunctionGenerator2D<PetscScalar> *kernel2;
157:   PetscFunctionGenerator3D<PetscScalar> *kernel3;

159:   /* customization */
160:   PetscInt  basisord;
161:   PetscInt  max_rank;
162:   PetscInt  bs;
163:   PetscReal rtol;
164:   PetscInt  norm_max_samples;
165:   PetscBool check_construction;

167:   /* keeps track of MatScale values */
168:   PetscScalar s;
169: } Mat_HARA;

171: static PetscErrorCode MatDestroy_HARA(Mat A)
172: {
173:   Mat_HARA       *a = (Mat_HARA*)A->data;

177:   haraDestroyDistributedHandle(a->handle);
178:   delete a->hmatrix;
179:   delete a->dist_hmatrix;
180:   PetscSFDestroy(&a->sf);
181:   PetscLayoutDestroy(&a->hara_rmap);
182:   delete a->xx;
183:   delete a->yy;
184: #if defined(HARA_USE_GPU)
185:   delete a->hmatrix_gpu;
186:   delete a->dist_hmatrix_gpu;
187:   delete a->xx_gpu;
188:   delete a->yy_gpu;
189: #endif
190:   delete a->sampler;
191:   delete a->ptcloud1;
192:   delete a->ptcloud2;
193:   delete a->ptcloud3;
194:   delete a->kernel1;
195:   delete a->kernel2;
196:   delete a->kernel3;
197:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",NULL);
198:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",NULL);
199:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",NULL);
200:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",NULL);
201:   PetscObjectChangeTypeName((PetscObject)A,NULL);
202:   PetscFree(A->data);
203:   return(0);
204: }

206: static PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
207: {
208:   PetscSF           rankssf;
209:   const PetscSFNode *iremote;
210:   PetscSFNode       *viremote,*rremotes;
211:   const PetscInt    *ilocal;
212:   PetscInt          *vilocal = NULL,*ldrs;
213:   const PetscMPIInt *ranks;
214:   PetscMPIInt       *sranks;
215:   PetscInt          nranks,nr,nl,vnr,vnl,i,v,j,maxl;
216:   MPI_Comm          comm;
217:   PetscErrorCode    ierr;

220:   if (nv == 1) {
221:     PetscObjectReference((PetscObject)sf);
222:     *vsf = sf;
223:     return(0);
224:   }
225:   PetscObjectGetComm((PetscObject)sf,&comm);
226:   PetscSFGetGraph(sf,&nr,&nl,&ilocal,&iremote);
227:   PetscSFGetLeafRange(sf,NULL,&maxl);
228:   maxl += 1;
229:   if (ldl == PETSC_DECIDE) ldl = maxl;
230:   if (ldr == PETSC_DECIDE) ldr = nr;
231:   if (ldr < nr) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldr,nr);
232:   if (ldl < maxl) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldl,maxl);
233:   vnr  = nr*nv;
234:   vnl  = nl*nv;
235:   PetscMalloc1(vnl,&viremote);
236:   if (ilocal) {
237:     PetscMalloc1(vnl,&vilocal);
238:   }

240:   /* TODO: Should this special SF be available, e.g.
241:      PetscSFGetRanksSF or similar? */
242:   PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
243:   PetscMalloc1(nranks,&sranks);
244:   PetscArraycpy(sranks,ranks,nranks);
245:   PetscSortMPIInt(nranks,sranks);
246:   PetscMalloc1(nranks,&rremotes);
247:   for (i=0;i<nranks;i++) {
248:     rremotes[i].rank  = sranks[i];
249:     rremotes[i].index = 0;
250:   }
251:   PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&rankssf);
252:   PetscSFSetGraph(rankssf,1,nranks,NULL,PETSC_OWN_POINTER,rremotes,PETSC_OWN_POINTER);
253:   PetscMalloc1(nranks,&ldrs);
254:   PetscSFBcastBegin(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
255:   PetscSFBcastEnd(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
256:   PetscSFDestroy(&rankssf);

258:   j = -1;
259:   for (i=0;i<nl;i++) {
260:     const PetscInt r  = iremote[i].rank;
261:     const PetscInt ii = iremote[i].index;

263:     if (j < 0 || sranks[j] != r) {
264:       PetscFindMPIInt(r,nranks,sranks,&j);
265:     }
266:     if (j < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unable to locate neighbor rank %D",r);
267:     for (v=0;v<nv;v++) {
268:       viremote[v*nl + i].rank  = r;
269:       viremote[v*nl + i].index = v*ldrs[j] + ii;
270:       if (ilocal) vilocal[v*nl + i] = v*ldl + ilocal[i];
271:     }
272:   }
273:   PetscFree(sranks);
274:   PetscFree(ldrs);
275:   PetscSFCreate(comm,vsf);
276:   PetscSFSetGraph(*vsf,vnr,vnl,vilocal,PETSC_OWN_POINTER,viremote,PETSC_OWN_POINTER);
277:   return(0);
278: }

280: static PetscErrorCode VecSign(Vec v, Vec s)
281: {
282:   const PetscScalar *av;
283:   PetscScalar       *as;
284:   PetscInt          i,n;
285:   PetscErrorCode    ierr;

290:   VecGetArrayRead(v,&av);
291:   VecGetArrayWrite(s,&as);
292:   VecGetLocalSize(s,&n);
293:   VecGetLocalSize(v,&i);
294:   if (i != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Invalid local sizes %D != %D",i,n);
295:   for (i=0;i<n;i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
296:   VecRestoreArrayWrite(s,&as);
297:   VecRestoreArrayRead(v,&av);
298:   return(0);
299: }

301: /* these are approximate norms */
302: /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
303:    NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
304: static PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal* n)
305: {
306:   Vec            x,y,w,z;
307:   PetscReal      normz,adot;
308:   PetscScalar    dot;
309:   PetscInt       i,j,N,jold = -1;

313:   switch (normtype) {
314:   case NORM_INFINITY:
315:   case NORM_1:
316:     if (normsamples < 0) normsamples = 10; /* pure guess */
317:     if (normtype == NORM_INFINITY) {
318:       Mat B;
319:       MatCreateTranspose(A,&B);
320:       A = B;
321:     } else {
322:       PetscObjectReference((PetscObject)A);
323:     }
324:     MatCreateVecs(A,&x,&y);
325:     MatCreateVecs(A,&z,&w);
326:     VecGetSize(x,&N);
327:     VecSet(x,1./N);
328:     VecSetOption(x,VEC_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE);
329:     *n   = 0.0;
330:     for (i = 0; i < normsamples; i++) {
331:       MatMult(A,x,y);
332:       VecSign(y,w);
333:       MatMultTranspose(A,w,z);
334:       VecNorm(z,NORM_INFINITY,&normz);
335:       VecDot(x,z,&dot);
336:       adot = PetscAbsScalar(dot);
337:       PetscInfo4(A,"%s norm it %D -> (%g %g)\n",NormTypes[normtype],i,(double)normz,(double)adot);
338:       if (normz <= adot && i > 0) {
339:         VecNorm(y,NORM_1,n);
340:         break;
341:       }
342:       VecSet(x,0.);
343:       VecMax(z,&j,&normz);
344:       if (j == jold) {
345:         VecNorm(y,NORM_1,n);
346:         PetscInfo2(A,"%s norm it %D -> breakdown (j==jold)\n",NormTypes[normtype],i);
347:         break;
348:       }
349:       jold = j;
350:       VecSetValue(x,j,1.0,INSERT_VALUES);
351:       VecAssemblyBegin(x);
352:       VecAssemblyEnd(x);
353:     }
354:     MatDestroy(&A);
355:     VecDestroy(&x);
356:     VecDestroy(&w);
357:     VecDestroy(&y);
358:     VecDestroy(&z);
359:     break;
360:   case NORM_2:
361:     if (normsamples < 0) normsamples = 20; /* pure guess */
362:     MatCreateVecs(A,&x,&y);
363:     MatCreateVecs(A,&z,NULL);
364:     VecSetRandom(x,NULL);
365:     VecNormalize(x,NULL);
366:     *n   = 0.0;
367:     for (i = 0; i < normsamples; i++) {
368:       MatMult(A,x,y);
369:       VecNormalize(y,n);
370:       MatMultTranspose(A,y,z);
371:       VecNorm(z,NORM_2,&normz);
372:       VecDot(x,z,&dot);
373:       adot = PetscAbsScalar(dot);
374:       PetscInfo5(A,"%s norm it %D -> %g (%g %g)\n",NormTypes[normtype],i,(double)*n,(double)normz,(double)adot);
375:       if (normz <= adot) break;
376:       if (i < normsamples - 1) {
377:         Vec t;

379:         VecNormalize(z,NULL);
380:         t = x;
381:         x = z;
382:         z = t;
383:       }
384:     }
385:     VecDestroy(&x);
386:     VecDestroy(&y);
387:     VecDestroy(&z);
388:     break;
389:   default:
390:     SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"%s norm not supported",NormTypes[normtype]);
391:   }
392:   PetscInfo3(A,"%s norm %g computed in %D iterations\n",NormTypes[normtype],(double)*n,i);
393:   return(0);
394: }

396: PETSC_EXTERN PetscErrorCode MatNorm_HARA(Mat A, NormType normtype, PetscReal* n)
397: {
399:   PetscBool      ishara;
400:   PetscInt       nmax = PETSC_DECIDE;

403:   PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);
404:   if (ishara) {
405:     Mat_HARA *a = (Mat_HARA*)A->data;

407:     nmax = a->norm_max_samples;
408:   }
409:   MatApproximateNorm_Private(A,normtype,nmax,n);
410:   return(0);
411: }

413: static PetscErrorCode MatMultNKernel_HARA(Mat A, PetscBool transA, Mat B, Mat C)
414: {
415:   Mat_HARA       *hara = (Mat_HARA*)A->data;
416:   haraHandle_t   handle = hara->handle->handle;
417:   PetscBool      boundtocpu = PETSC_TRUE;
418:   PetscScalar    *xx,*yy,*uxx,*uyy;
419:   PetscInt       blda,clda;
420:   PetscSF        bsf,csf;

424: #if defined(HARA_USE_GPU)
425:   boundtocpu = A->boundtocpu;
426: #endif
427:   MatDenseGetLDA(B,&blda);
428:   MatDenseGetLDA(C,&clda);
429:   if (hara->sf) {
430:     PetscInt n;

432:     PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
433:     PetscObjectQuery((PetscObject)B,"_mathara_vectorsf",(PetscObject*)&bsf);
434:     if (!bsf) {
435:       PetscSFGetVectorSF(hara->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);
436:       PetscObjectCompose((PetscObject)B,"_mathara_vectorsf",(PetscObject)bsf);
437:       PetscObjectDereference((PetscObject)bsf);
438:     }
439:     PetscObjectQuery((PetscObject)C,"_mathara_vectorsf",(PetscObject*)&csf);
440:     if (!csf) {
441:       PetscSFGetVectorSF(hara->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);
442:       PetscObjectCompose((PetscObject)C,"_mathara_vectorsf",(PetscObject)csf);
443:       PetscObjectDereference((PetscObject)csf);
444:     }
445:     blda = n;
446:     clda = n;
447:   }
448:   if (!boundtocpu) {
449: #if defined(HARA_USE_GPU)
450:     PetscBool ciscuda,biscuda;

452:     if (hara->sf) {
453:       PetscInt n;

455:       PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
456:       if (hara->xxs_gpu < B->cmap->n) { hara->xx_gpu->resize(n*B->cmap->N); hara->xxs_gpu = B->cmap->N; }
457:       if (hara->yys_gpu < B->cmap->n) { hara->yy_gpu->resize(n*B->cmap->N); hara->yys_gpu = B->cmap->N; }
458:     }
459:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
460:     PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
461:     if (!biscuda) {
462:       MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
463:     }
464:     PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
465:     if (!ciscuda) {
466:       C->assembled = PETSC_TRUE;
467:       MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
468:     }
469:     MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
470:     MatDenseCUDAGetArrayWrite(C,&yy);
471:     if (hara->sf) {
472:       uxx  = MatHaraGetThrustPointer(*hara->xx_gpu);
473:       uyy  = MatHaraGetThrustPointer(*hara->yy_gpu);
474:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
475:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
476:     } else {
477:       uxx = xx;
478:       uyy = yy;
479:     }
480:     if (hara->dist_hmatrix_gpu) {
481:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
482:       distributed_hgemv(/* transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
483:     } else {
484:       hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
485:     }
486:     MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
487:     if (hara->sf) {
488:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
489:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
490:     }
491:     MatDenseCUDARestoreArrayWrite(C,&yy);
492:     if (!biscuda) {
493:       MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
494:     }
495:     if (!ciscuda) {
496:       MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
497:     }
498: #endif
499:   } else {
500:     if (hara->sf) {
501:       PetscInt n;

503:       PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
504:       if (hara->xxs < B->cmap->n) { hara->xx->resize(n*B->cmap->N); hara->xxs = B->cmap->N; }
505:       if (hara->yys < B->cmap->n) { hara->yy->resize(n*B->cmap->N); hara->yys = B->cmap->N; }
506:     }
507:     MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
508:     MatDenseGetArrayWrite(C,&yy);
509:     if (hara->sf) {
510:       uxx  = MatHaraGetThrustPointer(*hara->xx);
511:       uyy  = MatHaraGetThrustPointer(*hara->yy);
512:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
513:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
514:     } else {
515:       uxx = xx;
516:       uyy = yy;
517:     }
518:     if (hara->dist_hmatrix) {
519:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
520:       distributed_hgemv(/*transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
521:     } else {
522:       hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
523:     }
524:     MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
525:     if (hara->sf) {
526:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
527:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
528:     }
529:     MatDenseRestoreArrayWrite(C,&yy);
530:   }
531:   return(0);
532: }

534: static PetscErrorCode MatProductNumeric_HARA(Mat C)
535: {
536:   Mat_Product    *product = C->product;

540:   MatCheckProduct(C,1);
541:   switch (product->type) {
542:   case MATPRODUCT_AB:
543:     MatMultNKernel_HARA(product->A,PETSC_FALSE,product->B,C);
544:     break;
545:   case MATPRODUCT_AtB:
546:     MatMultNKernel_HARA(product->A,PETSC_TRUE,product->B,C);
547:     break;
548:   default:
549:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
550:   }
551:   return(0);
552: }

554: static PetscErrorCode MatProductSymbolic_HARA(Mat C)
555: {
557:   Mat_Product    *product = C->product;
558:   PetscBool      cisdense;
559:   Mat            A,B;

562:   MatCheckProduct(C,1);
563:   A = product->A;
564:   B = product->B;
565:   switch (product->type) {
566:   case MATPRODUCT_AB:
567:     MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
568:     MatSetBlockSizesFromMats(C,product->A,product->B);
569:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
570:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
571:     MatSetUp(C);
572:     break;
573:   case MATPRODUCT_AtB:
574:     MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
575:     MatSetBlockSizesFromMats(C,product->A,product->B);
576:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
577:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
578:     MatSetUp(C);
579:     break;
580:   default:
581:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
582:   }
583:   C->ops->productsymbolic = NULL;
584:   C->ops->productnumeric = MatProductNumeric_HARA;
585:   return(0);
586: }

588: static PetscErrorCode MatProductSetFromOptions_HARA(Mat C)
589: {
591:   MatCheckProduct(C,1);
592:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
593:     C->ops->productsymbolic = MatProductSymbolic_HARA;
594:   }
595:   return(0);
596: }

598: static PetscErrorCode MatMultKernel_HARA(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
599: {
600:   Mat_HARA       *hara = (Mat_HARA*)A->data;
601:   haraHandle_t   handle = hara->handle->handle;
602:   PetscBool      boundtocpu = PETSC_TRUE;
603:   PetscInt       n;
604:   PetscScalar    *xx,*yy,*uxx,*uyy;

608: #if defined(HARA_USE_GPU)
609:   boundtocpu = A->boundtocpu;
610: #endif
611:   if (hara->sf) {
612:     PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
613:   } else n = A->rmap->n;
614:   if (!boundtocpu) {
615: #if defined(HARA_USE_GPU)
616:     VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
617:     if (sy == 0.0) {
618:       VecCUDAGetArrayWrite(y,&yy);
619:     } else {
620:       VecCUDAGetArray(y,&yy);
621:     }
622:     if (hara->sf) {
623:       uxx = MatHaraGetThrustPointer(*hara->xx_gpu);
624:       uyy = MatHaraGetThrustPointer(*hara->yy_gpu);

626:       PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
627:       PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
628:       if (sy != 0.0) {
629:         PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
630:         PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
631:       }
632:     } else {
633:       uxx = xx;
634:       uyy = yy;
635:     }
636:     if (hara->dist_hmatrix_gpu) {
637:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
638:       distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, hara->handle);
639:     } else {
640:       hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
641:     }
642:     VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
643:     if (hara->sf) {
644:       PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
645:       PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
646:     }
647:     if (sy == 0.0) {
648:       VecCUDARestoreArrayWrite(y,&yy);
649:     } else {
650:       VecCUDARestoreArray(y,&yy);
651:     }
652: #endif
653:   } else {
654:     VecGetArrayRead(x,(const PetscScalar**)&xx);
655:     if (sy == 0.0) {
656:       VecGetArrayWrite(y,&yy);
657:     } else {
658:       VecGetArray(y,&yy);
659:     }
660:     if (hara->sf) {
661:       uxx = MatHaraGetThrustPointer(*hara->xx);
662:       uyy = MatHaraGetThrustPointer(*hara->yy);

664:       PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
665:       PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
666:       if (sy != 0.0) {
667:         PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
668:         PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
669:       }
670:     } else {
671:       uxx = xx;
672:       uyy = yy;
673:     }
674:     if (hara->dist_hmatrix) {
675:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
676:       distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, n, sy, uyy, n, 1, hara->handle);
677:     } else {
678:       hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, n, sy, uyy, n, 1, handle);
679:     }
680:     VecRestoreArrayRead(x,(const PetscScalar**)&xx);
681:     if (hara->sf) {
682:       PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
683:       PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
684:     }
685:     if (sy == 0.0) {
686:       VecRestoreArrayWrite(y,&yy);
687:     } else {
688:       VecRestoreArray(y,&yy);
689:     }
690:   }
691:   return(0);
692: }

694: static PetscErrorCode MatMultTranspose_HARA(Mat A, Vec x, Vec y)
695: {

699:   MatMultKernel_HARA(A,x,0.0,y,PETSC_TRUE);
700:   return(0);
701: }

703: static PetscErrorCode MatMult_HARA(Mat A, Vec x, Vec y)
704: {

708:   MatMultKernel_HARA(A,x,0.0,y,PETSC_FALSE);
709:   return(0);
710: }

712: static PetscErrorCode MatMultTransposeAdd_HARA(Mat A, Vec x, Vec y, Vec z)
713: {

717:   VecCopy(y,z);
718:   MatMultKernel_HARA(A,x,1.0,z,PETSC_TRUE);
719:   return(0);
720: }

722: static PetscErrorCode MatMultAdd_HARA(Mat A, Vec x, Vec y, Vec z)
723: {

727:   VecCopy(y,z);
728:   MatMultKernel_HARA(A,x,1.0,z,PETSC_FALSE);
729:   return(0);
730: }

732: static PetscErrorCode MatScale_HARA(Mat A, PetscScalar s)
733: {
734:   Mat_HARA *a = (Mat_HARA*)A->data;

737:   a->s *= s;
738:   return(0);
739: }

741: static PetscErrorCode MatSetFromOptions_HARA(PetscOptionItems *PetscOptionsObject,Mat A)
742: {
743:   Mat_HARA       *a = (Mat_HARA*)A->data;

747:   PetscOptionsHead(PetscOptionsObject,"HARA options");
748:   PetscOptionsInt("-mat_hara_leafsize","Leaf size when constructed from kernel",NULL,a->leafsize,&a->leafsize,NULL);
749:   PetscOptionsReal("-mat_hara_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
750:   PetscOptionsInt("-mat_hara_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
751:   PetscOptionsInt("-mat_hara_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
752:   PetscOptionsInt("-mat_hara_samples","Number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
753:   PetscOptionsReal("-mat_hara_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
754:   PetscOptionsBool("-mat_hara_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
755:   PetscOptionsTail();
756:   return(0);
757: }

759: static PetscErrorCode MatHaraSetCoords_HARA(Mat,PetscInt,const PetscReal[],MatHaraKernel,void*);

761: static PetscErrorCode MatHaraInferCoordinates_Private(Mat A)
762: {
763:   Mat_HARA          *a = (Mat_HARA*)A->data;
764:   Vec               c;
765:   PetscInt          spacedim;
766:   const PetscScalar *coords;
767:   PetscErrorCode    ierr;

770:   if (a->spacedim) return(0);
771:   PetscObjectQuery((PetscObject)A,"__mathara_coords",(PetscObject*)&c);
772:   if (!c && a->sampler) {
773:     Mat S = a->sampler->GetSamplingMat();

775:     PetscObjectQuery((PetscObject)S,"__mathara_coords",(PetscObject*)&c);
776:     if (!c) {
777:       PetscBool ishara;

779:       PetscObjectTypeCompare((PetscObject)S,MATHARA,&ishara);
780:       if (ishara) {
781:         Mat_HARA *s = (Mat_HARA*)S->data;

783:         a->spacedim = s->spacedim;
784:         if (s->ptcloud1) {
785:           a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*s->ptcloud1);
786:         } else if (s->ptcloud2) {
787:           a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*s->ptcloud2);
788:         } else if (s->ptcloud3) {
789:           a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*s->ptcloud3);
790:         }
791:       }
792:     }
793:   }
794:   if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
795:   VecGetArrayRead(c,&coords);
796:   VecGetBlockSize(c,&spacedim);
797:   MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);
798:   VecRestoreArrayRead(c,&coords);
799:   return(0);
800: }

802: static PetscErrorCode MatSetUpMultiply_HARA(Mat A)
803: {
804:   MPI_Comm       comm;
805:   PetscMPIInt    size;
807:   Mat_HARA       *a = (Mat_HARA*)A->data;
808:   IS             is;
809:   PetscInt       n,*idx;
810:   int            *iidx;
811:   PetscCopyMode  own;
812:   PetscBool      rid;

815:   if (a->multsetup) return(0);
816:   PetscObjectGetComm((PetscObject)A,&comm);
817:   MPI_Comm_size(comm,&size);
818:   if (size > 1) {
819:     iidx = MatHaraGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
820:     n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
821:   } else {
822:     n    = a->hmatrix->u_basis_tree.index_map.size();
823:     iidx = MatHaraGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
824:   }
825:   if (PetscDefined(USE_64BIT_INDICES)) {
826:     PetscInt i;

828:     own  = PETSC_OWN_POINTER;
829:     PetscMalloc1(n,&idx);
830:     for (i=0;i<n;i++) idx[i] = iidx[i];
831:   } else {
832:     own  = PETSC_USE_POINTER;
833:     idx  = iidx;
834:   }
835:   ISCreateGeneral(comm,n,idx,own,&is);
836:   ISIdentity(is,&rid);
837:   if (!rid) {
838:     PetscSFCreate(comm,&a->sf);
839:     PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
840: #if defined(HARA_USE_GPU)
841:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
842:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
843:     a->xxs_gpu = 1;
844:     a->yys_gpu = 1;
845: #endif
846:     a->xx  = new thrust::host_vector<PetscScalar>(n);
847:     a->yy  = new thrust::host_vector<PetscScalar>(n);
848:     a->xxs = 1;
849:     a->yys = 1;
850:   }
851:   ISDestroy(&is);
852:   a->multsetup = PETSC_TRUE;
853:   return(0);
854: }

856: static PetscErrorCode MatAssemblyEnd_HARA(Mat A, MatAssemblyType asstype)
857: {
858:   Mat_HARA       *a = (Mat_HARA*)A->data;
859:   haraHandle_t   handle = a->handle->handle;
860:   PetscBool      kernel = PETSC_FALSE;
861:   PetscBool      boundtocpu = PETSC_TRUE;
862:   MPI_Comm       comm;
863:   PetscMPIInt    size;

867:   PetscObjectGetComm((PetscObject)A,&comm);
868:   MPI_Comm_size(comm,&size);
869:   /* TODO REUSABILITY of geometric construction */
870:   delete a->hmatrix;
871:   delete a->dist_hmatrix;
872: #if defined(HARA_USE_GPU)
873:   delete a->hmatrix_gpu;
874:   delete a->dist_hmatrix_gpu;
875: #endif
876:   /* TODO: other? */
877:   BoxCenterAdmissibility<Hara_Real,1> adm1(a->eta,a->leafsize);
878:   BoxCenterAdmissibility<Hara_Real,2> adm2(a->eta,a->leafsize);
879:   BoxCenterAdmissibility<Hara_Real,3> adm3(a->eta,a->leafsize);
880:   if (size > 1) {
881:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
882:   } else {
883:     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
884:   }
885:   MatHaraInferCoordinates_Private(A);
886:   switch (a->spacedim) {
887:   case 1:
888:     if (!a->ptcloud1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
889:     if (a->kernel1) {
890:       kernel = PETSC_TRUE;
891:       if (size > 1) {
892:         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
893:       } else {
894:         buildHMatrix(*a->hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord,a->basisord);
895:       }
896:     } else {
897:       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
898:       buildHMatrixStructure(*a->hmatrix,*a->ptcloud1,adm1,a->leafsize,0,0);
899:     }
900:     break;
901:   case 2:
902:     if (!a->ptcloud2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
903:     if (a->kernel2) {
904:       kernel = PETSC_TRUE;
905:       if (size > 1) {
906:         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
907:       } else {
908:         buildHMatrix(*a->hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord,a->basisord);
909:       }
910:     } else {
911:       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
912:       buildHMatrixStructure(*a->hmatrix,*a->ptcloud2,adm2,a->leafsize,0,0);
913:     }
914:     break;
915:   case 3:
916:     if (!a->ptcloud3) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
917:     if (a->kernel3) {
918:       kernel = PETSC_TRUE;
919:       if (size > 1) {
920:         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
921:       } else {
922:         buildHMatrix(*a->hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord,a->basisord);
923:       }
924:     } else {
925:       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
926:       buildHMatrixStructure(*a->hmatrix,*a->ptcloud3,adm3,a->leafsize,0,0);
927:     }
928:     break;
929:   default:
930:     SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",a->spacedim);
931:   }

933:   MatSetUpMultiply_HARA(A);

935: #if defined(HARA_USE_GPU)
936:   boundtocpu = A->boundtocpu;
937:   if (!boundtocpu) {
938:     if (size > 1) {
939:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
940:     } else {
941:       a->hmatrix_gpu = new GPU_HMatrix(*a->hmatrix);
942:     }
943:   }
944: #endif
945:   if (size == 1) {
946:     if (!kernel && a->sampler) {
947:       PetscReal Anorm;
948:       bool      verbose = false;

950:       MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,PETSC_DECIDE,&Anorm);
951:       if (boundtocpu) {
952:         a->sampler->SetGPUSampling(false);
953:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
954: #if defined(HARA_USE_GPU)
955:       } else {
956:         a->sampler->SetGPUSampling(true);
957:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
958: #endif
959:       }
960:     }
961:   }
962: #if defined(HARA_USE_GPU)
963:   if (kernel) A->offloadmask = PETSC_OFFLOAD_BOTH;
964:   else A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
965: #endif

967:   if (!a->s) a->s = 1.0;
968:   A->assembled = PETSC_TRUE;

970:   if (a->sampler) {
971:     PetscBool check = a->check_construction;

973:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_hara_check",&check,NULL);
974:     if (check) {
975:       Mat       E,Ae;
976:       PetscReal n1,ni,n2;
977:       PetscReal n1A,niA,n2A;
978:       void      (*normfunc)(void);

980:       Ae   = a->sampler->GetSamplingMat();
981:       MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
982:       MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_HARA);
983:       MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
984:       MatNorm(E,NORM_1,&n1);
985:       MatNorm(E,NORM_INFINITY,&ni);
986:       MatNorm(E,NORM_2,&n2);

988:       MatGetOperation(Ae,MATOP_NORM,&normfunc);
989:       MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_HARA);
990:       MatNorm(Ae,NORM_1,&n1A);
991:       MatNorm(Ae,NORM_INFINITY,&niA);
992:       MatNorm(Ae,NORM_2,&n2A);
993:       n1A  = PetscMax(n1A,PETSC_SMALL);
994:       n2A  = PetscMax(n2A,PETSC_SMALL);
995:       niA  = PetscMax(niA,PETSC_SMALL);
996:       MatSetOperation(Ae,MATOP_NORM,normfunc);
997:       PetscPrintf(PetscObjectComm((PetscObject)A),"MATHARA construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
998:       MatDestroy(&E);
999:     }
1000:   }
1001:   return(0);
1002: }

1004: static PetscErrorCode MatZeroEntries_HARA(Mat A)
1005: {
1007:   PetscMPIInt    size;
1008:   Mat_HARA       *a = (Mat_HARA*)A->data;

1011:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1012:   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1013:   else {
1014:     a->hmatrix->clearData();
1015: #if defined(HARA_USE_GPU)
1016:     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017: #endif
1018:   }
1019:   return(0);
1020: }

1022: static PetscErrorCode MatDuplicate_HARA(Mat B, MatDuplicateOption op, Mat *nA)
1023: {
1024:   Mat            A;
1025:   Mat_HARA       *a, *b = (Mat_HARA*)B->data;
1026: #if defined(PETSC_HAVE_CUDA)
1027:   PetscBool      iscpu = PETSC_FALSE;
1028: #else
1029:   PetscBool      iscpu = PETSC_TRUE;
1030: #endif
1032:   MPI_Comm       comm;

1035:   PetscObjectGetComm((PetscObject)B,&comm);
1036:   MatCreate(comm,&A);
1037:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1038:   MatSetType(A,MATHARA);
1039:   MatPropagateSymmetryOptions(B,A);

1041:   a = (Mat_HARA*)A->data;
1042:   a->s = b->s;
1043:   a->spacedim = b->spacedim;
1044:   if (b->ptcloud1) {
1045:     a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*b->ptcloud1);
1046:   } else if (b->ptcloud2) {
1047:     a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*b->ptcloud2);
1048:   } else if (b->ptcloud3) {
1049:     a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*b->ptcloud3);
1050:   }
1051:   if (op == MAT_COPY_VALUES) {
1052:     if (b->kernel1) {
1053:       a->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(*b->kernel1);
1054:     } else if (b->kernel2) {
1055:       a->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(*b->kernel2);
1056:     } else if (b->kernel3) {
1057:       a->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(*b->kernel3);
1058:     }
1059:   }

1061:   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1062: #if defined(HARA_USE_GPU)
1063:   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1064: #endif
1065:   if (b->hmatrix) {
1066:     a->hmatrix = new HMatrix(*b->hmatrix);
1067:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1068:   }
1069: #if defined(HARA_USE_GPU)
1070:   if (b->hmatrix_gpu) {
1071:     a->hmatrix_gpu = new GPU_HMatrix(*b->hmatrix_gpu);
1072:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1073:   }
1074: #endif

1076:   MatSetUp(A);
1077:   MatSetUpMultiply_HARA(A);
1078:   if (op == MAT_COPY_VALUES) {
1079:     A->assembled = PETSC_TRUE;
1080: #if defined(PETSC_HAVE_CUDA)
1081:     iscpu = B->boundtocpu;
1082: #endif
1083:   }
1084:   MatBindToCPU(A,iscpu);

1086:   *nA = A;
1087:   return(0);
1088: }

1090: static PetscErrorCode MatView_HARA(Mat A, PetscViewer view)
1091: {
1092:   Mat_HARA          *hara = (Mat_HARA*)A->data;
1093:   PetscBool         isascii;
1094:   PetscErrorCode    ierr;
1095:   PetscMPIInt       size;
1096:   PetscViewerFormat format;

1099:   PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1100:   PetscViewerGetFormat(view,&format);
1101:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1102:   if (isascii) {
1103:     PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",hara->sampler ? "Mat" : (hara->spacedim ? "Kernel" : "None"));
1104:     PetscViewerASCIIPrintf(view,"  PointCloud dim %D\n",hara->spacedim);
1105:     PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %D, eta %g\n",hara->leafsize,(double)hara->eta);
1106:     if (hara->sampler) {
1107:       PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %D, samples %D, tolerance %g\n",hara->max_rank,hara->bs,(double)hara->rtol);
1108:     } else {
1109:       PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %D\n",hara->basisord);
1110:     }
1111:     if (size == 1) {
1112:       double dense_mem_cpu = hara->hmatrix ? hara->hmatrix->getDenseMemoryUsage() : 0;
1113:       double low_rank_cpu = hara->hmatrix ? hara->hmatrix->getLowRankMemoryUsage() : 0;
1114: #if defined(HARA_USE_GPU)
1115:       double dense_mem_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getDenseMemoryUsage() : 0;
1116:       double low_rank_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1117: #endif
1118:       PetscViewerASCIIPrintf(view,"  Memory consumption (CPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1119: #if defined(HARA_USE_GPU)
1120:       PetscViewerASCIIPrintf(view,"  Memory consumption (GPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1121: #endif
1122:     }
1123:   }
1124:   return(0);
1125: }

1127: static PetscErrorCode MatHaraSetCoords_HARA(Mat A, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel, void *kernelctx)
1128: {
1129:   Mat_HARA       *hara = (Mat_HARA*)A->data;
1130:   PetscReal      *gcoords;
1131:   PetscInt       N;
1132:   MPI_Comm       comm;
1133:   PetscMPIInt    size;
1134:   PetscBool      cong;

1138:   PetscLayoutSetUp(A->rmap);
1139:   PetscLayoutSetUp(A->cmap);
1140:   PetscObjectGetComm((PetscObject)A,&comm);
1141:   MatHasCongruentLayouts(A,&cong);
1142:   if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1143:   N    = A->rmap->N;
1144:   MPI_Comm_size(comm,&size);
1145:   if (size > 1) {
1146:     PetscSF      sf;
1147:     MPI_Datatype dtype;

1149:     MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1150:     MPI_Type_commit(&dtype);

1152:     PetscSFCreate(comm,&sf);
1153:     PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1154:     PetscMalloc1(spacedim*N,&gcoords);
1155:     PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1156:     PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1157:     PetscSFDestroy(&sf);
1158:     MPI_Type_free(&dtype);
1159:   } else gcoords = (PetscReal*)coords;

1161:   delete hara->ptcloud1;
1162:   delete hara->ptcloud2;
1163:   delete hara->ptcloud3;
1164:   delete hara->kernel1;
1165:   delete hara->kernel2;
1166:   delete hara->kernel3;
1167:   hara->spacedim = spacedim;
1168:   switch (spacedim) {
1169:   case 1:
1170:     hara->ptcloud1 = new PetscPointCloud1D<PetscReal>(N,gcoords);
1171:     if (kernel) hara->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(kernel,kernelctx);
1172:     break;
1173:   case 2:
1174:     hara->ptcloud2 = new PetscPointCloud2D<PetscReal>(N,gcoords);
1175:     if (kernel) hara->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(kernel,kernelctx);
1176:     break;
1177:   case 3:
1178:     hara->ptcloud3 = new PetscPointCloud3D<PetscReal>(N,gcoords);
1179:     if (kernel) hara->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(kernel,kernelctx);
1180:     break;
1181:   default:
1182:     SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",hara->spacedim);
1183:   }
1184:   if (gcoords != coords) { PetscFree(gcoords); }
1185:   A->preallocated = PETSC_TRUE;
1186:   return(0);
1187: }

1189: PETSC_EXTERN PetscErrorCode MatCreate_HARA(Mat A)
1190: {
1191:   Mat_HARA       *a;
1193:   PetscMPIInt    size;

1196:   PetscNewLog(A,&a);
1197:   A->data = (void*)a;

1199:   a->eta              = 0.9;
1200:   a->leafsize         = 32;
1201:   a->basisord         = 4;
1202:   a->max_rank         = 64;
1203:   a->bs               = 32;
1204:   a->rtol             = 1.e-4;
1205:   a->s                = 1.0;
1206:   a->norm_max_samples = 10;
1207:   haraCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));

1209:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1210:   PetscObjectChangeTypeName((PetscObject)A,MATHARA);
1211:   PetscMemzero(A->ops,sizeof(struct _MatOps));

1213:   A->ops->destroy          = MatDestroy_HARA;
1214:   A->ops->view             = MatView_HARA;
1215:   A->ops->assemblyend      = MatAssemblyEnd_HARA;
1216:   A->ops->mult             = MatMult_HARA;
1217:   A->ops->multtranspose    = MatMultTranspose_HARA;
1218:   A->ops->multadd          = MatMultAdd_HARA;
1219:   A->ops->multtransposeadd = MatMultTransposeAdd_HARA;
1220:   A->ops->scale            = MatScale_HARA;
1221:   A->ops->duplicate        = MatDuplicate_HARA;
1222:   A->ops->setfromoptions   = MatSetFromOptions_HARA;
1223:   A->ops->norm             = MatNorm_HARA;
1224:   A->ops->zeroentries      = MatZeroEntries_HARA;

1226:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",MatProductSetFromOptions_HARA);
1227:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",MatProductSetFromOptions_HARA);
1228:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",MatProductSetFromOptions_HARA);
1229:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",MatProductSetFromOptions_HARA);
1230: #if defined(PETSC_HAVE_CUDA)
1231:   PetscFree(A->defaultvectype);
1232:   PetscStrallocpy(VECCUDA,&A->defaultvectype);
1233: #endif
1234:   return(0);
1235: }

1237: PetscErrorCode MatHaraSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1238: {
1239:   PetscBool      ishara;

1246:   PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);
1247:   if (ishara) {
1248:     Mat_HARA *a = (Mat_HARA*)A->data;


1251:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1252:     a->sampler->SetSamplingMat(B);
1253:     if (bs > 0) a->bs = bs;
1254:     if (tol > 0.) a->rtol = tol;
1255:     delete a->kernel1;
1256:     delete a->kernel2;
1257:     delete a->kernel3;
1258:   }
1259:   return(0);
1260: }

1262: PetscErrorCode MatCreateHaraFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel,void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1263: {
1264:   Mat            A;
1265:   Mat_HARA       *hara;
1266: #if defined(PETSC_HAVE_CUDA)
1267:   PetscBool      iscpu = PETSC_FALSE;
1268: #else
1269:   PetscBool      iscpu = PETSC_TRUE;
1270: #endif

1274:   MatCreate(comm,&A);
1275:   MatSetSizes(A,m,n,M,N);
1276:   MatSetType(A,MATHARA);
1277:   MatBindToCPU(A,iscpu);
1278:   MatHaraSetCoords_HARA(A,spacedim,coords,kernel,kernelctx);

1280:   hara = (Mat_HARA*)A->data;
1281:   if (eta > 0.) hara->eta = eta;
1282:   if (leafsize > 0) hara->leafsize = leafsize;
1283:   if (basisord > 0) hara->basisord = basisord;

1285:   *nA = A;
1286:   return(0);
1287: }

1289: PetscErrorCode MatCreateHaraFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1290: {
1291:   Mat            A;
1292:   Mat_HARA       *hara;
1293:   MPI_Comm       comm;
1294: #if defined(PETSC_HAVE_CUDA)
1295:   PetscBool      iscpu = PETSC_FALSE;
1296: #else
1297:   PetscBool      iscpu = PETSC_TRUE;
1298: #endif

1305:   PetscObjectGetComm((PetscObject)B,&comm);
1306:   MatCreate(comm,&A);
1307:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1308:   MatSetType(A,MATHARA);
1309:   MatBindToCPU(A,iscpu);
1310:   if (spacedim) {
1311:     MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);
1312:   }
1313:   MatPropagateSymmetryOptions(B,A);
1314:   /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1316:   hara = (Mat_HARA*)A->data;
1317:   hara->sampler = new PetscMatrixSampler(B);
1318:   if (eta > 0.) hara->eta = eta;
1319:   if (leafsize > 0) hara->leafsize = leafsize;
1320:   if (maxrank > 0) hara->max_rank = maxrank;
1321:   if (bs > 0) hara->bs = bs;
1322:   if (rtol > 0.) hara->rtol = rtol;

1324:   *nA = A;
1325:   A->preallocated = PETSC_TRUE;
1326:   return(0);
1327: }