Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  4: #include <h2opus.h>
  5: #if defined(H2OPUS_USE_MPI)
  6: #include <h2opus/distributed/distributed_h2opus_handle.h>
  7: #include <h2opus/distributed/distributed_geometric_construction.h>
  8: #include <h2opus/distributed/distributed_hgemv.h>
  9: #include <h2opus/distributed/distributed_horthog.h>
 10: #include <h2opus/distributed/distributed_hcompress.h>
 11: #endif
 12: #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
 20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
 21: PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
 22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
 23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);

 25: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 27: /* Use GPU only if H2OPUS is configured for GPU */
 28: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 29: #define PETSC_H2OPUS_USE_GPU
 30: #endif
 31: #if defined(PETSC_H2OPUS_USE_GPU)
 32: #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
 33: #else
 34: #define MatH2OpusUpdateIfNeeded(A,B) 0
 35: #endif

 37: // TODO H2OPUS:
 38: // DistributedHMatrix
 39: //   unsymmetric ?
 40: //   transpose for distributed_hgemv?
 41: //   clearData()
 42: // Unify interface for sequential and parallel?
 43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 44: //
 45: template <class T> class PetscPointCloud : public H2OpusDataSet<T>
 46: {
 47:   private:
 48:     int dimension;
 49:     size_t num_points;
 50:     std::vector<T> pts;

 52:   public:
 53:     PetscPointCloud(int dim, size_t num_pts, const T coords[])
 54:     {
 55:       dim = dim > 0 ? dim : 1;
 56:       this->dimension = dim;
 57:       this->num_points = num_pts;

 59:       pts.resize(num_pts*dim);
 60:       if (coords) {
 61:         for (size_t n = 0; n < num_points; n++)
 62:           for (int i = 0; i < dim; i++)
 63:             pts[n*dim + i] = coords[n*dim + i];
 64:       } else {
 65:         PetscReal h = num_points > 1 ? 1./(num_points - 1) : 0.;
 66:         for (size_t n = 0; n < num_points; n++)
 67:           for (int i = 0; i < dim; i++)
 68:             pts[n*dim + i] = n*h; /* default: equispaced points on the unit interval */
 69:       }
 70:     }

 72:     PetscPointCloud(const PetscPointCloud<T>& other)
 73:     {
 74:       size_t N = other.dimension * other.num_points;
 75:       this->dimension = other.dimension;
 76:       this->num_points = other.num_points;
 77:       this->pts.resize(N);
 78:       for (size_t i = 0; i < N; i++)
 79:         this->pts[i] = other.pts[i];
 80:     }

 82:     int getDimension() const
 83:     {
 84:         return dimension;
 85:     }

 87:     size_t getDataSetSize() const
 88:     {
 89:         return num_points;
 90:     }

 92:     T getDataPoint(size_t idx, int dim) const
 93:     {
 94:         assert(dim < dimension && idx < num_points);
 95:         return pts[idx*dimension + dim];
 96:     }

 98:     void Print(std::ostream& out = std::cout)
 99:     {
100:        out << "Dimension: " << dimension << std::endl;
101:        out << "NumPoints: " << num_points << std::endl;
102:        for (size_t n = 0; n < num_points; n++) {
103:          for (int d = 0; d < dimension; d++)
104:            out << pts[n*dimension + d] << " ";
105:          out << std::endl;
106:        }
107:     }
108: };

110: template<class T> class PetscFunctionGenerator
111: {
112: private:
113:   MatH2OpusKernel k;
114:   int             dim;
115:   void            *ctx;

117: public:
118:     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
119:     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
120:     T operator()(PetscReal *pt1, PetscReal *pt2)
121:     {
122:         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
123:     }
124: };

126: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

128: /* just to not clutter the code */
129: #if !defined(H2OPUS_USE_GPU)
130: typedef HMatrix HMatrix_GPU;
131: #if defined(H2OPUS_USE_MPI)
132: typedef DistributedHMatrix DistributedHMatrix_GPU;
133: #endif
134: #endif

136: typedef struct {
137: #if defined(H2OPUS_USE_MPI)
138:   distributedH2OpusHandle_t handle;
139: #else
140:   h2opusHandle_t handle;
141: #endif
142:   /* Sequential and parallel matrices are two different classes at the moment */
143:   HMatrix *hmatrix;
144: #if defined(H2OPUS_USE_MPI)
145:   DistributedHMatrix *dist_hmatrix;
146: #else
147:   HMatrix *dist_hmatrix; /* just to not clutter the code */
148: #endif
149:   /* May use permutations */
150:   PetscSF sf;
151:   PetscLayout h2opus_rmap, h2opus_cmap;
152:   IS h2opus_indexmap;
153:   thrust::host_vector<PetscScalar> *xx,*yy;
154:   PetscInt xxs,yys;
155:   PetscBool multsetup;

157:   /* GPU */
158:   HMatrix_GPU *hmatrix_gpu;
159: #if defined(H2OPUS_USE_MPI)
160:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
161: #else
162:   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
163: #endif
164: #if defined(PETSC_H2OPUS_USE_GPU)
165:   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
166:   PetscInt xxs_gpu,yys_gpu;
167: #endif

169:   /* construction from matvecs */
170:   PetscMatrixSampler* sampler;
171:   PetscBool nativemult;

173:   /* Admissibility */
174:   PetscReal eta;
175:   PetscInt  leafsize;

177:   /* for dof reordering */
178:   PetscPointCloud<PetscReal> *ptcloud;

180:   /* kernel for generating matrix entries */
181:   PetscFunctionGenerator<PetscScalar> *kernel;

183:   /* basis orthogonalized? */
184:   PetscBool orthogonal;

186:   /* customization */
187:   PetscInt  basisord;
188:   PetscInt  max_rank;
189:   PetscInt  bs;
190:   PetscReal rtol;
191:   PetscInt  norm_max_samples;
192:   PetscBool check_construction;
193:   PetscBool hara_verbose;

195:   /* keeps track of MatScale values */
196:   PetscScalar s;
197: } Mat_H2OPUS;

199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

203: #if defined(H2OPUS_USE_MPI)
204:   h2opusDestroyDistributedHandle(a->handle);
205: #else
206:   h2opusDestroyHandle(a->handle);
207: #endif
208:   delete a->dist_hmatrix;
209:   delete a->hmatrix;
210:   PetscSFDestroy(&a->sf);
211:   PetscLayoutDestroy(&a->h2opus_rmap);
212:   PetscLayoutDestroy(&a->h2opus_cmap);
213:   ISDestroy(&a->h2opus_indexmap);
214:   delete a->xx;
215:   delete a->yy;
216:   delete a->hmatrix_gpu;
217:   delete a->dist_hmatrix_gpu;
218: #if defined(PETSC_H2OPUS_USE_GPU)
219:   delete a->xx_gpu;
220:   delete a->yy_gpu;
221: #endif
222:   delete a->sampler;
223:   delete a->ptcloud;
224:   delete a->kernel;
225:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL);
226:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL);
227:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL);
228:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL);
229:   PetscObjectChangeTypeName((PetscObject)A,NULL);
230:   PetscFree(A->data);
231:   return 0;
232: }

234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
237:   PetscBool      ish2opus;

241:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
242:   if (ish2opus) {
243:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245:         PetscLayout t;
246:         t = A->rmap;
247:         A->rmap = a->h2opus_rmap;
248:         a->h2opus_rmap = t;
249:         t = A->cmap;
250:         A->cmap = a->h2opus_cmap;
251:         a->h2opus_cmap = t;
252:       }
253:     }
254:     a->nativemult = nm;
255:   }
256:   return 0;
257: }

259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
262:   PetscBool      ish2opus;

266:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
268:   *nm = a->nativemult;
269:   return 0;
270: }

272: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
273: {
274:   PetscBool      ish2opus;
275:   PetscInt       nmax = PETSC_DECIDE;
276:   Mat_H2OPUS     *a = NULL;
277:   PetscBool      mult = PETSC_FALSE;

279:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
 280:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
281:     a = (Mat_H2OPUS*)A->data;

283:     nmax = a->norm_max_samples;
284:     mult = a->nativemult;
285:     MatH2OpusSetNativeMult(A,PETSC_TRUE);
286:   } else {
287:     PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL);
288:   }
289:   MatApproximateNorm_Private(A,normtype,nmax,n);
290:   if (a) MatH2OpusSetNativeMult(A,mult);
291:   return 0;
292: }

294: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
295: {
296:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
297:   PetscInt       n;
298:   PetscBool      boundtocpu = PETSC_TRUE;

300: #if defined(PETSC_H2OPUS_USE_GPU)
301:   boundtocpu = A->boundtocpu;
302: #endif
303:   PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
304:   if (boundtocpu) {
305:     if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
306:     if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
307:   }
308: #if defined(PETSC_H2OPUS_USE_GPU)
309:   if (!boundtocpu) {
310:     if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
311:     if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
312:   }
313: #endif
314:   return 0;
315: }

317: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
318: {
319:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
320: #if defined(H2OPUS_USE_MPI)
321:   h2opusHandle_t handle = h2opus->handle->handle;
322: #else
323:   h2opusHandle_t handle = h2opus->handle;
324: #endif
325:   PetscBool      boundtocpu = PETSC_TRUE;
326:   PetscScalar    *xx,*yy,*uxx,*uyy;
327:   PetscInt       blda,clda;
328:   PetscMPIInt    size;
329:   PetscSF        bsf,csf;
330:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

332:   HLibProfile::clear();
333: #if defined(PETSC_H2OPUS_USE_GPU)
334:   boundtocpu = A->boundtocpu;
335: #endif
336:   MatDenseGetLDA(B,&blda);
337:   MatDenseGetLDA(C,&clda);
338:   if (usesf) {
339:     PetscInt n;

341:     MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf);
342:     MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf);

344:     MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N);
345:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
346:     blda = n;
347:     clda = n;
348:   }
349:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
350:   if (boundtocpu) {
351:     MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
352:     MatDenseGetArrayWrite(C,&yy);
353:     if (usesf) {
354:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
355:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
356:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
357:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
358:     } else {
359:       uxx = xx;
360:       uyy = yy;
361:     }
362:     if (size > 1) {
365: #if defined(H2OPUS_USE_MPI)
366:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
367: #endif
368:     } else {
370:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
371:     }
372:     MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
373:     if (usesf) {
374:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
375:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
376:     }
377:     MatDenseRestoreArrayWrite(C,&yy);
378: #if defined(PETSC_H2OPUS_USE_GPU)
379:   } else {
380:     PetscBool ciscuda,biscuda;

382:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
383:     PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
384:     if (!biscuda) {
385:       MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
386:     }
387:     PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
388:     if (!ciscuda) {
389:       C->assembled = PETSC_TRUE;
390:       MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
391:     }
392:     MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
393:     MatDenseCUDAGetArrayWrite(C,&yy);
394:     if (usesf) {
395:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
396:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
397:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
398:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
399:     } else {
400:       uxx = xx;
401:       uyy = yy;
402:     }
403:     PetscLogGpuTimeBegin();
404:     if (size > 1) {
407: #if defined(H2OPUS_USE_MPI)
408:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
409: #endif
410:     } else {
412:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
413:     }
414:     PetscLogGpuTimeEnd();
415:     MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
416:     if (usesf) {
417:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
418:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
419:     }
420:     MatDenseCUDARestoreArrayWrite(C,&yy);
421:     if (!biscuda) {
422:       MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
423:     }
424:     if (!ciscuda) {
425:       MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
426:     }
427: #endif
428:   }
429:   { /* log flops */
430:     double gops,time,perf,dev;
431:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
432: #if defined(PETSC_H2OPUS_USE_GPU)
433:     if (boundtocpu) {
434:       PetscLogFlops(1e9*gops);
435:     } else {
436:       PetscLogGpuFlops(1e9*gops);
437:     }
438: #else
439:     PetscLogFlops(1e9*gops);
440: #endif
441:   }
442:   return 0;
443: }

445: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
446: {
447:   Mat_Product    *product = C->product;

449:   MatCheckProduct(C,1);
450:   switch (product->type) {
451:   case MATPRODUCT_AB:
452:     MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C);
453:     break;
454:   case MATPRODUCT_AtB:
455:     MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C);
456:     break;
457:   default:
458:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
459:   }
460:   return 0;
461: }

463: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
464: {
465:   Mat_Product    *product = C->product;
466:   PetscBool      cisdense;
467:   Mat            A,B;

469:   MatCheckProduct(C,1);
470:   A = product->A;
471:   B = product->B;
472:   switch (product->type) {
473:   case MATPRODUCT_AB:
474:     MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
475:     MatSetBlockSizesFromMats(C,product->A,product->B);
476:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
477:     if (!cisdense) MatSetType(C,((PetscObject)product->B)->type_name);
478:     MatSetUp(C);
479:     break;
480:   case MATPRODUCT_AtB:
481:     MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
482:     MatSetBlockSizesFromMats(C,product->A,product->B);
483:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
484:     if (!cisdense) MatSetType(C,((PetscObject)product->B)->type_name);
485:     MatSetUp(C);
486:     break;
487:   default:
488:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
489:   }
490:   C->ops->productsymbolic = NULL;
491:   C->ops->productnumeric = MatProductNumeric_H2OPUS;
492:   return 0;
493: }

495: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
496: {
497:   MatCheckProduct(C,1);
498:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
499:     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
500:   }
501:   return 0;
502: }

504: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
505: {
506:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
507: #if defined(H2OPUS_USE_MPI)
508:   h2opusHandle_t handle = h2opus->handle->handle;
509: #else
510:   h2opusHandle_t handle = h2opus->handle;
511: #endif
512:   PetscBool      boundtocpu = PETSC_TRUE;
513:   PetscInt       n;
514:   PetscScalar    *xx,*yy,*uxx,*uyy;
515:   PetscMPIInt    size;
516:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

518:   HLibProfile::clear();
519:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
520: #if defined(PETSC_H2OPUS_USE_GPU)
521:   boundtocpu = A->boundtocpu;
522: #endif
523:   if (usesf) {
524:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
525:   } else n = A->rmap->n;
526:   if (boundtocpu) {
527:     VecGetArrayRead(x,(const PetscScalar**)&xx);
528:     if (sy == 0.0) {
529:       VecGetArrayWrite(y,&yy);
530:     } else {
531:       VecGetArray(y,&yy);
532:     }
533:     if (usesf) {
534:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
535:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

537:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
538:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
539:       if (sy != 0.0) {
540:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
541:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
542:       }
543:     } else {
544:       uxx = xx;
545:       uyy = yy;
546:     }
547:     if (size > 1) {
550: #if defined(H2OPUS_USE_MPI)
551:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
552: #endif
553:     } else {
555:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
556:     }
557:     VecRestoreArrayRead(x,(const PetscScalar**)&xx);
558:     if (usesf) {
559:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
560:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
561:     }
562:     if (sy == 0.0) {
563:       VecRestoreArrayWrite(y,&yy);
564:     } else {
565:       VecRestoreArray(y,&yy);
566:     }
567: #if defined(PETSC_H2OPUS_USE_GPU)
568:   } else {
569:     VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
570:     if (sy == 0.0) {
571:       VecCUDAGetArrayWrite(y,&yy);
572:     } else {
573:       VecCUDAGetArray(y,&yy);
574:     }
575:     if (usesf) {
576:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
577:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

579:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
580:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
581:       if (sy != 0.0) {
582:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
583:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
584:       }
585:     } else {
586:       uxx = xx;
587:       uyy = yy;
588:     }
589:     PetscLogGpuTimeBegin();
590:     if (size > 1) {
593: #if defined(H2OPUS_USE_MPI)
594:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
595: #endif
596:     } else {
598:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
599:     }
600:     PetscLogGpuTimeEnd();
601:     VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
602:     if (usesf) {
603:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
604:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
605:     }
606:     if (sy == 0.0) {
607:       VecCUDARestoreArrayWrite(y,&yy);
608:     } else {
609:       VecCUDARestoreArray(y,&yy);
610:     }
611: #endif
612:   }
613:   { /* log flops */
614:     double gops,time,perf,dev;
615:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
616: #if defined(PETSC_H2OPUS_USE_GPU)
617:     if (boundtocpu) {
618:       PetscLogFlops(1e9*gops);
619:     } else {
620:       PetscLogGpuFlops(1e9*gops);
621:     }
622: #else
623:     PetscLogFlops(1e9*gops);
624: #endif
625:   }
626:   return 0;
627: }

629: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
630: {
631:   PetscBool      xiscuda,yiscuda;

633:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
634:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
635:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
636:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE);
637:   return 0;
638: }

640: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
641: {
642:   PetscBool      xiscuda,yiscuda;

644:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
645:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
646:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
647:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE);
648:   return 0;
649: }

651: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
652: {
653:   PetscBool      xiscuda,ziscuda;

655:   VecCopy(y,z);
656:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
657:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
658:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
659:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE);
660:   return 0;
661: }

663: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
664: {
665:   PetscBool      xiscuda,ziscuda;

667:   VecCopy(y,z);
668:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
669:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
670:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
671:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE);
672:   return 0;
673: }

675: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
676: {
677:   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

679:   a->s *= s;
680:   return 0;
681: }

683: static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
684: {
685:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

687:   PetscOptionsHead(PetscOptionsObject,"H2OPUS options");
688:   PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL);
689:   PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
690:   PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
691:   PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
692:   PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
 693:   PetscOptionsInt("-mat_h2opus_normsamples","Maximum number of samples to be used when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL);
694:   PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
695:   PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
696:   PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL);
697:   PetscOptionsTail();
698:   return 0;
699: }

701: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);

703: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
704: {
705:   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
706:   Vec               c;
707:   PetscInt          spacedim;
708:   const PetscScalar *coords;

710:   if (a->ptcloud) return 0;
711:   PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c);
712:   if (!c && a->sampler) {
713:     Mat S = a->sampler->GetSamplingMat();

715:     PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c);
716:   }
717:   if (!c) {
718:     MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL);
719:   } else {
720:     VecGetArrayRead(c,&coords);
721:     VecGetBlockSize(c,&spacedim);
722:     MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL);
723:     VecRestoreArrayRead(c,&coords);
724:   }
725:   return 0;
726: }

728: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
729: {
730:   MPI_Comm       comm;
731:   PetscMPIInt    size;
732:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
733:   PetscInt       n = 0,*idx = NULL;
734:   int            *iidx = NULL;
735:   PetscCopyMode  own;
736:   PetscBool      rid;

738:   if (a->multsetup) return 0;
739:   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
740:     PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
741: #if defined(PETSC_H2OPUS_USE_GPU)
742:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
743:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
744:     a->xxs_gpu = 1;
745:     a->yys_gpu = 1;
746: #endif
747:     a->xx  = new thrust::host_vector<PetscScalar>(n);
748:     a->yy  = new thrust::host_vector<PetscScalar>(n);
749:     a->xxs = 1;
750:     a->yys = 1;
751:   } else {
752:     IS is;
753:     PetscObjectGetComm((PetscObject)A,&comm);
754:     MPI_Comm_size(comm,&size);
755:     if (!a->h2opus_indexmap) {
756:       if (size > 1) {
758: #if defined(H2OPUS_USE_MPI)
759:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
760:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
761: #endif
762:       } else {
763:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
764:         n    = a->hmatrix->u_basis_tree.index_map.size();
765:       }

767:       if (PetscDefined(USE_64BIT_INDICES)) {
768:         PetscInt i;

770:         own  = PETSC_OWN_POINTER;
771:         PetscMalloc1(n,&idx);
772:         for (i=0;i<n;i++) idx[i] = iidx[i];
773:       } else {
774:         own  = PETSC_COPY_VALUES;
775:         idx  = (PetscInt*)iidx;
776:       }
777:       ISCreateGeneral(comm,n,idx,own,&is);
778:       ISSetPermutation(is);
779:       ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view");
780:       a->h2opus_indexmap = is;
781:     }
782:     ISGetLocalSize(a->h2opus_indexmap,&n);
783:     ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
784:     rid  = (PetscBool)(n == A->rmap->n);
785:     MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm);
786:     if (rid) {
787:       ISIdentity(a->h2opus_indexmap,&rid);
788:     }
789:     if (!rid) {
790:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
791:         PetscLayoutCreate(comm,&a->h2opus_rmap);
792:         PetscLayoutSetLocalSize(a->h2opus_rmap,n);
793:         PetscLayoutSetUp(a->h2opus_rmap);
794:         PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap);
795:       }
796:       PetscSFCreate(comm,&a->sf);
797:       PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
798:       PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view");
799: #if defined(PETSC_H2OPUS_USE_GPU)
800:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
801:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
802:       a->xxs_gpu = 1;
803:       a->yys_gpu = 1;
804: #endif
805:       a->xx  = new thrust::host_vector<PetscScalar>(n);
806:       a->yy  = new thrust::host_vector<PetscScalar>(n);
807:       a->xxs = 1;
808:       a->yys = 1;
809:     }
810:     ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
811:   }
812:   a->multsetup = PETSC_TRUE;
813:   return 0;
814: }

816: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
817: {
818:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
819: #if defined(H2OPUS_USE_MPI)
820:   h2opusHandle_t handle = a->handle->handle;
821: #else
822:   h2opusHandle_t handle = a->handle;
823: #endif
824:   PetscBool      kernel = PETSC_FALSE;
825:   PetscBool      boundtocpu = PETSC_TRUE;
826:   PetscBool      samplingdone = PETSC_FALSE;
827:   MPI_Comm       comm;
828:   PetscMPIInt    size;

830:   PetscObjectGetComm((PetscObject)A,&comm);

834:   /* XXX */
835:   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

837:   MPI_Comm_size(comm,&size);
838:   /* TODO REUSABILITY of geometric construction */
839:   delete a->hmatrix;
840:   delete a->dist_hmatrix;
841: #if defined(PETSC_H2OPUS_USE_GPU)
842:   delete a->hmatrix_gpu;
843:   delete a->dist_hmatrix_gpu;
844: #endif
845:   a->orthogonal = PETSC_FALSE;

847:   /* TODO: other? */
848:   H2OpusBoxCenterAdmissibility adm(a->eta);

850:   PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0);
851:   if (size > 1) {
852: #if defined(H2OPUS_USE_MPI)
853:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
854: #else
855:     a->dist_hmatrix = NULL;
856: #endif
857:   } else {
858:     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
859:   }
860:   MatH2OpusInferCoordinates_Private(A);
862:   if (a->kernel) {
863:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
864:     if (size > 1) {
866: #if defined(H2OPUS_USE_MPI)
867:       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
868: #endif
869:     } else {
870:       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
871:     }
872:     kernel = PETSC_TRUE;
873:   } else {
875:     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
876:   }
877:   MatSetUpMultiply_H2OPUS(A);

879: #if defined(PETSC_H2OPUS_USE_GPU)
880:   boundtocpu = A->boundtocpu;
881:   if (!boundtocpu) {
882:     if (size > 1) {
884: #if defined(H2OPUS_USE_MPI)
885:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
886: #endif
887:     } else {
888:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
889:     }
890:   }
891: #endif
892:   if (size == 1) {
893:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
894:       PetscReal Anorm;
895:       bool      verbose;

897:       PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL);
898:       verbose = a->hara_verbose;
899:       MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm);
 900:       if (a->hara_verbose) PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %" PetscInt_FMT ", tol %g (%g*%g), %s samples %" PetscInt_FMT "\n",a->max_rank,(double)(a->rtol*Anorm),(double)a->rtol,(double)Anorm,boundtocpu ? "CPU" : "GPU",a->bs);
901:       if (a->sf && !a->nativemult) {
902:         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
903:       }
904:       a->sampler->SetStream(handle->getMainStream());
905:       if (boundtocpu) {
906:         a->sampler->SetGPUSampling(false);
907:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
908: #if defined(PETSC_H2OPUS_USE_GPU)
909:       } else {
910:         a->sampler->SetGPUSampling(true);
911:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
912: #endif
913:       }
914:       samplingdone = PETSC_TRUE;
915:     }
916:   }
917: #if defined(PETSC_H2OPUS_USE_GPU)
918:   if (!boundtocpu) {
919:     delete a->hmatrix;
920:     delete a->dist_hmatrix;
921:     a->hmatrix = NULL;
922:     a->dist_hmatrix = NULL;
923:   }
924:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
925: #endif
926:   PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0);

928:   if (!a->s) a->s = 1.0;
929:   A->assembled = PETSC_TRUE;

931:   if (samplingdone) {
932:     PetscBool check = a->check_construction;
933:     PetscBool checke = PETSC_FALSE;

935:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL);
936:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL);
937:     if (check) {
938:       Mat       E,Ae;
939:       PetscReal n1,ni,n2;
940:       PetscReal n1A,niA,n2A;
941:       void      (*normfunc)(void);

943:       Ae   = a->sampler->GetSamplingMat();
944:       MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
945:       MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
946:       MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
947:       MatNorm(E,NORM_1,&n1);
948:       MatNorm(E,NORM_INFINITY,&ni);
949:       MatNorm(E,NORM_2,&n2);
950:       if (checke) {
951:         Mat eA,eE,eAe;

953:         MatComputeOperator(A,MATAIJ,&eA);
954:         MatComputeOperator(E,MATAIJ,&eE);
955:         MatComputeOperator(Ae,MATAIJ,&eAe);
956:         MatChop(eA,PETSC_SMALL);
957:         MatChop(eE,PETSC_SMALL);
958:         MatChop(eAe,PETSC_SMALL);
959:         PetscObjectSetName((PetscObject)eA,"H2Mat");
960:         MatView(eA,NULL);
961:         PetscObjectSetName((PetscObject)eAe,"S");
962:         MatView(eAe,NULL);
963:         PetscObjectSetName((PetscObject)eE,"H2Mat - S");
964:         MatView(eE,NULL);
965:         MatDestroy(&eA);
966:         MatDestroy(&eE);
967:         MatDestroy(&eAe);
968:       }

970:       MatGetOperation(Ae,MATOP_NORM,&normfunc);
971:       MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
972:       MatNorm(Ae,NORM_1,&n1A);
973:       MatNorm(Ae,NORM_INFINITY,&niA);
974:       MatNorm(Ae,NORM_2,&n2A);
975:       n1A  = PetscMax(n1A,PETSC_SMALL);
976:       n2A  = PetscMax(n2A,PETSC_SMALL);
977:       niA  = PetscMax(niA,PETSC_SMALL);
978:       MatSetOperation(Ae,MATOP_NORM,normfunc);
979:       PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
980:       MatDestroy(&E);
981:     }
982:     a->sampler->SetSamplingMat(NULL);
983:   }
984:   return 0;
985: }

987: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
988: {
989:   PetscMPIInt    size;
990:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

 992:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
 993:   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
 994:   else {
995:     a->hmatrix->clearData();
996: #if defined(PETSC_H2OPUS_USE_GPU)
997:     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
998: #endif
999:   }
1000:   return 0;
1001: }

1003: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1004: {
1005:   Mat            A;
1006:   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1007: #if defined(PETSC_H2OPUS_USE_GPU)
1008:   PetscBool      iscpu = PETSC_FALSE;
1009: #else
1010:   PetscBool      iscpu = PETSC_TRUE;
1011: #endif
1012:   MPI_Comm       comm;

1014:   PetscObjectGetComm((PetscObject)B,&comm);
1015:   MatCreate(comm,&A);
1016:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1017:   MatSetType(A,MATH2OPUS);
1018:   MatPropagateSymmetryOptions(B,A);
1019:   a = (Mat_H2OPUS*)A->data;

1021:   a->eta              = b->eta;
1022:   a->leafsize         = b->leafsize;
1023:   a->basisord         = b->basisord;
1024:   a->max_rank         = b->max_rank;
1025:   a->bs               = b->bs;
1026:   a->rtol             = b->rtol;
1027:   a->norm_max_samples = b->norm_max_samples;
1028:   if (op == MAT_COPY_VALUES) a->s = b->s;

1030:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1031:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1033: #if defined(H2OPUS_USE_MPI)
1034:   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1035: #if defined(PETSC_H2OPUS_USE_GPU)
1036:   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1037: #endif
1038: #endif
1039:   if (b->hmatrix) {
1040:     a->hmatrix = new HMatrix(*b->hmatrix);
1041:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1042:   }
1043: #if defined(PETSC_H2OPUS_USE_GPU)
1044:   if (b->hmatrix_gpu) {
1045:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1046:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1047:   }
1048: #endif
1049:   if (b->sf) {
1050:     PetscObjectReference((PetscObject)b->sf);
1051:     a->sf = b->sf;
1052:   }
1053:   if (b->h2opus_indexmap) {
1054:     PetscObjectReference((PetscObject)b->h2opus_indexmap);
1055:     a->h2opus_indexmap = b->h2opus_indexmap;
1056:   }

1058:   MatSetUp(A);
1059:   MatSetUpMultiply_H2OPUS(A);
1060:   if (op == MAT_COPY_VALUES) {
1061:     A->assembled = PETSC_TRUE;
1062:     a->orthogonal = b->orthogonal;
1063: #if defined(PETSC_H2OPUS_USE_GPU)
1064:     A->offloadmask = B->offloadmask;
1065: #endif
1066:   }
1067: #if defined(PETSC_H2OPUS_USE_GPU)
1068:   iscpu = B->boundtocpu;
1069: #endif
1070:   MatBindToCPU(A,iscpu);

1072:   *nA = A;
1073:   return 0;
1074: }

1076: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1077: {
1078:   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1079:   PetscBool         isascii;
1080:   PetscMPIInt       size;
1081:   PetscViewerFormat format;

1083:   PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1084:   PetscViewerGetFormat(view,&format);
1085:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1086:   if (isascii) {
1087:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1088:       if (size == 1) {
1089:         FILE *fp;
1090:         PetscViewerASCIIGetPointer(view,&fp);
1091:         dumpHMatrix(*h2opus->hmatrix,6,fp);
1092:       }
1093:     } else {
1094:       PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat");
1095:       PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1096:       PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta);
1097:       if (!h2opus->kernel) {
1098:         PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol);
1099:       } else {
1100:         PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord);
1101:       }
1102:       PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples);
1103:       if (size == 1) {
1104:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1105:         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1106: #if defined(PETSC_HAVE_CUDA)
1107:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1108:         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1109: #endif
1110:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1111: #if defined(PETSC_HAVE_CUDA)
1112:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1113: #endif
1114:       } else {
1115: #if defined(PETSC_HAVE_CUDA)
1116:         double      matrix_mem[4] = {0.,0.,0.,0.};
1117:         PetscMPIInt rsize = 4;
1118: #else
1119:         double      matrix_mem[2] = {0.,0.};
1120:         PetscMPIInt rsize = 2;
1121: #endif
1122: #if defined(H2OPUS_USE_MPI)
1123:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1124:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1125: #if defined(PETSC_HAVE_CUDA)
1126:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1127:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1128: #endif
1129: #endif
1130:         MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE,MPI_SUM,PetscObjectComm((PetscObject)A));
1131:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1132: #if defined(PETSC_HAVE_CUDA)
1133:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1134: #endif
1135:       }
1136:     }
1137:   }
1138: #if 0
1139:   if (size == 1) {
1140:     char filename[256];
1141:     const char *name;

1143:     PetscObjectGetName((PetscObject)A,&name);
1144:     PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name);
1145:     outputEps(*h2opus->hmatrix,filename);
1146:   }
1147: #endif
1148:   return 0;
1149: }

1151: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1152: {
1153:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1154:   PetscReal      *gcoords;
1155:   PetscInt       N;
1156:   MPI_Comm       comm;
1157:   PetscMPIInt    size;
1158:   PetscBool      cong;

1160:   PetscLayoutSetUp(A->rmap);
1161:   PetscLayoutSetUp(A->cmap);
1162:   PetscObjectGetComm((PetscObject)A,&comm);
1163:   MatHasCongruentLayouts(A,&cong);
1165:   N    = A->rmap->N;
1166:   MPI_Comm_size(comm,&size);
1167:   if (spacedim > 0 && size > 1 && cdist) {
1168:     PetscSF      sf;
1169:     MPI_Datatype dtype;

1171:     MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1172:     MPI_Type_commit(&dtype);

1174:     PetscSFCreate(comm,&sf);
1175:     PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1176:     PetscMalloc1(spacedim*N,&gcoords);
1177:     PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1178:     PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1179:     PetscSFDestroy(&sf);
1180:     MPI_Type_free(&dtype);
1181:   } else gcoords = (PetscReal*)coords;

1183:   delete h2opus->ptcloud;
1184:   delete h2opus->kernel;
1185:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1186:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1187:   if (gcoords != coords) PetscFree(gcoords);
1188:   A->preallocated = PETSC_TRUE;
1189:   return 0;
1190: }

1192: #if defined(PETSC_H2OPUS_USE_GPU)
1193: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1194: {
1195:   PetscMPIInt    size;
1196:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1198:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1199:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1200:     if (size > 1) {
1202: #if defined(H2OPUS_USE_MPI)
1203:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1204:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1205: #endif
1206:     } else {
1208:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1209:       else *a->hmatrix = *a->hmatrix_gpu;
1210:     }
1211:     delete a->hmatrix_gpu;
1212:     delete a->dist_hmatrix_gpu;
1213:     a->hmatrix_gpu = NULL;
1214:     a->dist_hmatrix_gpu = NULL;
1215:     A->offloadmask = PETSC_OFFLOAD_CPU;
1216:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1217:     if (size > 1) {
1219: #if defined(H2OPUS_USE_MPI)
1220:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1221:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1222: #endif
1223:     } else {
1225:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1226:       else *a->hmatrix_gpu = *a->hmatrix;
1227:     }
1228:     delete a->hmatrix;
1229:     delete a->dist_hmatrix;
1230:     a->hmatrix = NULL;
1231:     a->dist_hmatrix = NULL;
1232:     A->offloadmask = PETSC_OFFLOAD_GPU;
1233:   }
1234:   PetscFree(A->defaultvectype);
1235:   if (!flg) {
1236:     PetscStrallocpy(VECCUDA,&A->defaultvectype);
1237:   } else {
1238:     PetscStrallocpy(VECSTANDARD,&A->defaultvectype);
1239:   }
1240:   A->boundtocpu = flg;
1241:   return 0;
1242: }
1243: #endif

1245: /*MC
1246:      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.

1248:    Options Database Keys:
1249: .     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1251:    Notes:
1252:      H2Opus implements hierarchical matrices in the H^2 flavour.
1253:      It supports execution on CPUs and on NVIDIA GPUs.
1254:      For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust.
1255:      To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
1256:      For details and additional references, see
1257:        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1258:      available at https://arxiv.org/abs/2109.05451.
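
     Example usage (a minimal sketch; assumes A is an assembled MATH2OPUS):
.vb
     MatH2OpusSetNativeMult(A,PETSC_TRUE);  /* use H2Opus' internal ordering/distribution in MatMult() */
     MatCreateVecs(A,&x,&y);                /* vectors now follow the native layouts */
     MatMult(A,x,y);
.ve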

1260:    Level: beginner

1262: .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1263: M*/
1264: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1265: {
1266:   Mat_H2OPUS     *a;
1267:   PetscMPIInt    size;

1269: #if defined(PETSC_H2OPUS_USE_GPU)
1270:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1271: #endif
1272:   PetscNewLog(A,&a);
1273:   A->data = (void*)a;

1275:   a->eta              = 0.9;
1276:   a->leafsize         = 32;
1277:   a->basisord         = 4;
1278:   a->max_rank         = 64;
1279:   a->bs               = 32;
1280:   a->rtol             = 1.e-4;
1281:   a->s                = 1.0;
1282:   a->norm_max_samples = 10;
1283: #if defined(H2OPUS_USE_MPI)
1284:   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1285: #else
1286:   h2opusCreateHandle(&a->handle);
1287: #endif
1288:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1289:   PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS);
1290:   PetscMemzero(A->ops,sizeof(struct _MatOps));

1292:   A->ops->destroy          = MatDestroy_H2OPUS;
1293:   A->ops->view             = MatView_H2OPUS;
1294:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1295:   A->ops->mult             = MatMult_H2OPUS;
1296:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1297:   A->ops->multadd          = MatMultAdd_H2OPUS;
1298:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1299:   A->ops->scale            = MatScale_H2OPUS;
1300:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1301:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1302:   A->ops->norm             = MatNorm_H2OPUS;
1303:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1304: #if defined(PETSC_H2OPUS_USE_GPU)
1305:   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1306: #endif

1308:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS);
1309:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS);
1310:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS);
1311:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS);
1312: #if defined(PETSC_H2OPUS_USE_GPU)
1313:   PetscFree(A->defaultvectype);
1314:   PetscStrallocpy(VECCUDA,&A->defaultvectype);
1315: #endif
1316:   return 0;
1317: }

1319: /*@C
1320:      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1322:    Input Parameter:
1323: .     A - the matrix
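
   Example usage (a minimal sketch; A is an assembled MATH2OPUS):
.vb
   MatH2OpusOrthogonalize(A);  /* no-op if the basis is already orthogonal */
.ve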

1325:    Level: intermediate

1327: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1328: @*/
1329: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1330: {
1331:   PetscBool      ish2opus;
1332:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1333:   PetscMPIInt    size;
1334:   PetscBool      boundtocpu = PETSC_TRUE;

1338:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1339:   if (!ish2opus) return 0;
1340:   if (a->orthogonal) return 0;
1341:   HLibProfile::clear();
1342:   PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0);
1343: #if defined(PETSC_H2OPUS_USE_GPU)
1344:   boundtocpu = A->boundtocpu;
1345: #endif
1346:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1347:   if (size > 1) {
1348:     if (boundtocpu) {
1350: #if defined(H2OPUS_USE_MPI)
1351:       distributed_horthog(*a->dist_hmatrix, a->handle);
1352: #endif
1353: #if defined(PETSC_H2OPUS_USE_GPU)
1354:       A->offloadmask = PETSC_OFFLOAD_CPU;
1355:     } else {
1357:       PetscLogGpuTimeBegin();
1358: #if defined(H2OPUS_USE_MPI)
1359:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1360: #endif
1361:       PetscLogGpuTimeEnd();
1362: #endif
1363:     }
1364:   } else {
1365: #if defined(H2OPUS_USE_MPI)
1366:     h2opusHandle_t handle = a->handle->handle;
1367: #else
1368:     h2opusHandle_t handle = a->handle;
1369: #endif
1370:     if (boundtocpu) {
1372:       horthog(*a->hmatrix, handle);
1373: #if defined(PETSC_H2OPUS_USE_GPU)
1374:       A->offloadmask = PETSC_OFFLOAD_CPU;
1375:     } else {
1377:       PetscLogGpuTimeBegin();
1378:       horthog(*a->hmatrix_gpu, handle);
1379:       PetscLogGpuTimeEnd();
1380: #endif
1381:     }
1382:   }
1383:   a->orthogonal = PETSC_TRUE;
1384:   { /* log flops */
1385:     double gops,time,perf,dev;
1386:     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1387: #if defined(PETSC_H2OPUS_USE_GPU)
1388:     if (boundtocpu) {
1389:       PetscLogFlops(1e9*gops);
1390:     } else {
1391:       PetscLogGpuFlops(1e9*gops);
1392:     }
1393: #else
1394:     PetscLogFlops(1e9*gops);
1395: #endif
1396:   }
1397:   PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0);
1398:   return 0;
1399: }

1401: /*@C
1402:      MatH2OpusCompress - Compress a hierarchical matrix.

1404:    Input Parameters:
1405: +     A - the matrix
1406: -     tol - the absolute truncation threshold
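
   Example usage (a minimal sketch; A is an assembled MATH2OPUS):
.vb
   MatH2OpusCompress(A,1.e-4);  /* orthogonalizes the basis first if needed, then truncates with threshold 1e-4 */
.ve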

1408:    Level: intermediate

1410: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1411: @*/
1412: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1413: {
1414:   PetscBool      ish2opus;
1415:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1416:   PetscMPIInt    size;
1417:   PetscBool      boundtocpu = PETSC_TRUE;

1422:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1423:   if (!ish2opus || tol <= 0.0) return 0;
1424:   MatH2OpusOrthogonalize(A);
1425:   HLibProfile::clear();
1426:   PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0);
1427: #if defined(PETSC_H2OPUS_USE_GPU)
1428:   boundtocpu = A->boundtocpu;
1429: #endif
1430:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1431:   if (size > 1) {
1432:     if (boundtocpu) {
1434: #if defined(H2OPUS_USE_MPI)
1435:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1436: #endif
1437: #if defined(PETSC_H2OPUS_USE_GPU)
1438:       A->offloadmask = PETSC_OFFLOAD_CPU;
1439:     } else {
1441:       PetscLogGpuTimeBegin();
1442: #if defined(H2OPUS_USE_MPI)
1443:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1444: #endif
1445:       PetscLogGpuTimeEnd();
1446: #endif
1447:     }
1448:   } else {
1449: #if defined(H2OPUS_USE_MPI)
1450:     h2opusHandle_t handle = a->handle->handle;
1451: #else
1452:     h2opusHandle_t handle = a->handle;
1453: #endif
1454:     if (boundtocpu) {
1456:       hcompress(*a->hmatrix, tol, handle);
1457: #if defined(PETSC_H2OPUS_USE_GPU)
1458:       A->offloadmask = PETSC_OFFLOAD_CPU;
1459:     } else {
1461:       PetscLogGpuTimeBegin();
1462:       hcompress(*a->hmatrix_gpu, tol, handle);
1463:       PetscLogGpuTimeEnd();
1464: #endif
1465:     }
1466:   }
1467:   { /* log flops */
1468:     double gops,time,perf,dev;
1469:     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1470: #if defined(PETSC_H2OPUS_USE_GPU)
1471:     if (boundtocpu) {
1472:       PetscLogFlops(1e9*gops);
1473:     } else {
1474:       PetscLogGpuFlops(1e9*gops);
1475:     }
1476: #else
1477:     PetscLogFlops(1e9*gops);
1478: #endif
1479:   }
1480:   PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0);
1481:   return 0;
1482: }

1484: /*@C
1485:      MatH2OpusSetSamplingMat - Set the matrix to be sampled via matrix-vector products when constructing the hierarchical matrix.

1487:    Input Parameters:
1488: +     A - the hierarchical matrix
1489: .     B - the matrix to be sampled
1490: .     bs - maximum number of samples to be taken concurrently
1491: -     tol - relative tolerance for construction

1493:    Notes: MatAssemblyBegin()/MatAssemblyEnd() must be called to update the hierarchical matrix, as sketched below.
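
   Example usage (a minimal sketch; assumes A was created from B with MatCreateH2OpusFromMat()):
.vb
   MatH2OpusSetSamplingMat(A,B,32,1.e-4);   /* sample 32 vectors at a time, relative tolerance 1e-4 */
   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);    /* recomputes the compressed representation by sampling B */
.ve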

1495:    Level: intermediate

1497: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1498: @*/
1499: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1500: {
1501:   PetscBool      ish2opus;

1508:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1509:   if (ish2opus) {
1510:     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

1512:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1513:     a->sampler->SetSamplingMat(B);
1514:     if (bs > 0) a->bs = bs;
1515:     if (tol > 0.) a->rtol = tol;
1516:     delete a->kernel;
1517:   }
1518:   return 0;
1519: }

1521: /*@C
1522:      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.

1524:    Input Parameters:
1525: +     comm - MPI communicator
1526: .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1527: .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1528: .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1529: .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1530: .     spacedim - dimension of the space coordinates
1531: .     coords - coordinates of the points
1532: .     cdist - whether or not coordinates are distributed
1533: .     kernel - computational kernel (or NULL)
1534: .     kernelctx - kernel context
1535: .     eta - admissibility condition tolerance
1536: .     leafsize - leaf size in cluster tree
1537: -     basisord - approximation order for Chebychev interpolation of low-rank blocks

1539:    Output Parameter:
1540: .     nA - matrix

1542:    Options Database Keys:
1543: +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1544: .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1545: .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1546: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
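
   Example usage (a minimal sketch; GaussKernel is an illustrative user-written function matching the MatH2OpusKernel signature, and coords[] stores the spacedim coordinates of the N points interleaved):
.vb
   static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
   {
     PetscReal r2 = 0.0;
     for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d])*(x[d] - y[d]);
     return PetscExpReal(-r2);
   }

   MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,N,N,
                             spacedim,coords,PETSC_FALSE,GaussKernel,NULL,
                             PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT,&A); /* PETSC_DEFAULT keeps default eta, leafsize, basisord */
   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve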

1548:    Level: intermediate

1550: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1551: @*/
1552: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1553: {
1554:   Mat            A;
1555:   Mat_H2OPUS     *h2opus;
1556: #if defined(PETSC_H2OPUS_USE_GPU)
1557:   PetscBool      iscpu = PETSC_FALSE;
1558: #else
1559:   PetscBool      iscpu = PETSC_TRUE;
1560: #endif

1563:   MatCreate(comm,&A);
1564:   MatSetSizes(A,m,n,M,N);
1566:   MatSetType(A,MATH2OPUS);
1567:   MatBindToCPU(A,iscpu);
1568:   MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx);

1570:   h2opus = (Mat_H2OPUS*)A->data;
1571:   if (eta > 0.) h2opus->eta = eta;
1572:   if (leafsize > 0) h2opus->leafsize = leafsize;
1573:   if (basisord > 0) h2opus->basisord = basisord;

1575:   *nA = A;
1576:   return 0;
1577: }
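
// Usage sketch (illustrative; not part of the original source): a simple
// distance-based kernel and the corresponding construction call. `GaussKernel`,
// `nlocal`, `spacedim`, `coords`, and all numeric parameters are assumptions;
// the callback signature follows the MatH2OpusKernel type used above.
//
//   static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
//   {
//     PetscReal d2 = 0.0;
//     for (PetscInt k = 0; k < sdim; k++) d2 += (x[k] - y[k])*(x[k] - y[k]);
//     return PetscExpReal(-d2);
//   }
//
//   /* coords: spacedim*nlocal array of point coordinates owned by this process */
//   Mat A;
//   MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE,
//                             spacedim,coords,PETSC_FALSE,GaussKernel,NULL,
//                             0.9,32,4,&A);
//   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
//   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);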

1579: /*@C
1580:      MatCreateH2OpusFromMat - Creates a MATH2OPUS matrix by sampling a user-supplied operator.

1582:    Input Parameters:
1583: +     B - the matrix to be sampled
1584: .     spacedim - dimension of the space coordinates
1585: .     coords - coordinates of the points
1586: .     cdist - whether or not coordinates are distributed
1587: .     eta - admissibility condition tolerance
1588: .     leafsize - leaf size in cluster tree
1589: .     maxrank - maximum rank allowed
1590: .     bs - maximum number of samples to be taken concurrently
1591: -     rtol - relative tolerance for construction

1593:    Output Parameter:
1594: .     nA - matrix

1596:    Options Database Keys:
1597: +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1598: .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1599: .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1600: .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1601: .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1602: .     -mat_h2opus_check <PetscBool> - Check the approximation error when constructing from sampling during MatAssemblyEnd()
1603: .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1604: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

1606:    Notes: Not available in parallel.

1608:    Level: intermediate

1610: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1611: @*/
1612: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1613: {
1614:   Mat            A;
1615:   Mat_H2OPUS     *h2opus;
1616:   MPI_Comm       comm;
1617:   PetscBool      boundtocpu = PETSC_TRUE;

1628:   PetscObjectGetComm((PetscObject)B,&comm);
1631:   MatCreate(comm,&A);
1632:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1633: #if defined(PETSC_H2OPUS_USE_GPU)
1634:   {
1635:     PetscBool iscuda;
1636:     VecType   vtype;

1638:     MatGetVecType(B,&vtype);
1639:     PetscStrcmp(vtype,VECCUDA,&iscuda);
1640:     if (!iscuda) {
1641:       PetscStrcmp(vtype,VECSEQCUDA,&iscuda);
1642:       if (!iscuda) {
1643:         PetscStrcmp(vtype,VECMPICUDA,&iscuda);
1644:       }
1645:     }
1646:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1647:   }
1648: #endif
1649:   MatSetType(A,MATH2OPUS);
1650:   MatBindToCPU(A,boundtocpu);
1651:   if (spacedim) {
1652:     MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL);
1653:   }
1654:   MatPropagateSymmetryOptions(B,A);

1657:   h2opus = (Mat_H2OPUS*)A->data;
1658:   h2opus->sampler = new PetscMatrixSampler(B);
1659:   if (eta > 0.) h2opus->eta = eta;
1660:   if (leafsize > 0) h2opus->leafsize = leafsize;
1661:   if (maxrank > 0) h2opus->max_rank = maxrank;
1662:   if (bs > 0) h2opus->bs = bs;
1663:   if (rtol > 0.) h2opus->rtol = rtol;
1664:   *nA = A;
1665:   A->preallocated = PETSC_TRUE;
1666:   return 0;
1667: }
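
// Usage sketch (illustrative; not part of the original source): approximate an
// existing sequential operator B by sampling its action (the routine is not
// available in parallel, per the Notes above). The numeric parameters below
// (eta, leafsize, maxrank, bs, rtol) are placeholders; the options database
// keys listed above override them at runtime.
//
//   Mat A;
//   MatCreateH2OpusFromMat(B,spacedim,coords,PETSC_FALSE,0.9,32,64,32,1e-4,&A);
//   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
//   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);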

1669: /*@C
1670:      MatH2OpusGetIndexMap - Access the reordering index set.

1672:    Input Parameter:
1673: .     A - the matrix

1675:    Output Parameter:
1676: .     indexmap - the index set for the reordering

1678:    Level: intermediate

1680: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1681: @*/
1682: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1683: {
1684:   PetscBool      ish2opus;
1685:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1691:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1693:   *indexmap = a->h2opus_indexmap;
1694:   return 0;
1695: }
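
// Usage sketch (illustrative; not part of the original source): retrieve and
// view the reordering index set. Based on the plain pointer assignment above,
// the IS appears to be a reference to internal data, so the caller should not
// destroy it.
//
//   IS indexmap;
//   MatH2OpusGetIndexMap(A,&indexmap);
//   ISView(indexmap,PETSC_VIEWER_STDOUT_WORLD);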

1697: /*@C
1698:      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

1700:    Input Parameters:
1701: +     A - the matrix
1702: .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1703: -     in - the vector to be mapped

1705:    Output Parameter:
1706: .     out - the newly created mapped vector

1708:    Level: intermediate

1710: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1711: @*/
1712: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1713: {
1714:   PetscBool      ish2opus;
1715:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1716:   PetscScalar    *xin,*xout;
1717:   PetscBool      nm;

1725:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1727:   nm   = a->nativemult;
1728:   MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc);
1729:   MatCreateVecs(A,out,NULL);
1730:   MatH2OpusSetNativeMult(A,nm);
1731:   if (!a->sf) { /* same ordering */
1732:     VecCopy(in,*out);
1733:     return 0;
1734:   }
1735:   VecGetArrayRead(in,(const PetscScalar**)&xin);
1736:   VecGetArrayWrite(*out,&xout);
1737:   if (nativetopetsc) {
1738:     PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1739:     PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1740:   } else {
1741:     PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1742:     PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1743:   }
1744:   VecRestoreArrayRead(in,(const PetscScalar**)&xin);
1745:   VecRestoreArrayWrite(*out,&xout);
1746:   return 0;
1747: }
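
// Usage sketch (illustrative; not part of the original source): map a vector
// from PETSc to H2Opus ordering and back. The output vectors are created by
// the routine (via MatCreateVecs above) and are owned by the caller.
//
//   Vec xh2opus,xpetsc;
//   MatH2OpusMapVec(A,PETSC_FALSE,x,&xh2opus);     /* PETSc -> H2Opus ordering */
//   MatH2OpusMapVec(A,PETSC_TRUE,xh2opus,&xpetsc); /* H2Opus -> PETSc ordering */
//   VecDestroy(&xh2opus);
//   VecDestroy(&xpetsc);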

1749: /*@C
1750:      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T

1752:    Input Parameters:
1753: +     A - the hierarchical matrix
1754: .     U - the dense low-rank update matrix
1755: .     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1756: -     s - the scaling factor

1758:    Notes: The U and V matrices must be in dense format

1760:    Level: intermediate

1762: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE
1763: @*/
1764: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1765: {
1766:   PetscBool      flg;

1773:   if (V) {
1776:   }

1779:   if (!V) V = U;
1781:   if (!U->cmap->N) return 0;
1782:   PetscLayoutCompare(U->rmap,A->rmap,&flg);
1784:   PetscLayoutCompare(V->rmap,A->cmap,&flg);
1786:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg);
1787:   if (flg) {
1788:     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1789:     const PetscScalar *u,*v,*uu,*vv;
1790:     PetscInt          ldu,ldv;
1791:     PetscMPIInt       size;
1792: #if defined(H2OPUS_USE_MPI)
1793:     h2opusHandle_t    handle = a->handle->handle;
1794: #else
1795:     h2opusHandle_t    handle = a->handle;
1796: #endif
1797:     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1798:     PetscSF           usf,vsf;

1800:     MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1802:     PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0);
1803:     PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"");
1805:     PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"");
1807:     MatDenseGetLDA(U,&ldu);
1808:     MatDenseGetLDA(V,&ldv);
1809:     MatBoundToCPU(A,&flg);
1810:     if (usesf) {
1811:       PetscInt n;

1813:       MatDenseGetH2OpusVectorSF(U,a->sf,&usf);
1814:       MatDenseGetH2OpusVectorSF(V,a->sf,&vsf);
1815:       MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N);
1816:       PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
1817:       ldu = n;
1818:       ldv = n;
1819:     }
1820:     if (flg) {
1822:       MatDenseGetArrayRead(U,&u);
1823:       MatDenseGetArrayRead(V,&v);
1824:       if (usesf) {
1825:         vv = MatH2OpusGetThrustPointer(*a->yy);
1826:         PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1827:         PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1828:         if (U != V) {
1829:           uu = MatH2OpusGetThrustPointer(*a->xx);
1830:           PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1831:           PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1832:         } else uu = vv;
1833:       } else { uu = u; vv = v; }
1834:       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1835:       MatDenseRestoreArrayRead(U,&u);
1836:       MatDenseRestoreArrayRead(V,&v);
1837:     } else {
1838: #if defined(PETSC_H2OPUS_USE_GPU)
1839:       PetscBool flgU, flgV;

1842:       PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"");
1843:       if (flgU) MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U);
1844:       PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"");
1845:       if (flgV) MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V);
1846:       MatDenseCUDAGetArrayRead(U,&u);
1847:       MatDenseCUDAGetArrayRead(V,&v);
1848:       if (usesf) {
1849:         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1850:         PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1851:         PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1852:         if (U != V) {
1853:           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1854:           PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1855:           PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1856:         } else uu = vv;
1857:       } else { uu = u; vv = v; }
1858: #else
1859:       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1860: #endif
1861:       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1862: #if defined(PETSC_H2OPUS_USE_GPU)
1863:       MatDenseCUDARestoreArrayRead(U,&u);
1864:       MatDenseCUDARestoreArrayRead(V,&v);
1865:       if (flgU) MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U);
1866:       if (flgV) MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V);
1867: #endif
1868:     }
1869:     PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0);
1870:     a->orthogonal = PETSC_FALSE;
1871:   }
1872:   return 0;
1873: }
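
// Usage sketch (illustrative; not part of the original source): symmetric
// rank-k downdate A <- A - U*U^T, passing NULL for V as documented above. Since
// the update marks the matrix as no longer orthogonal, an optional
// recompression (placeholder tolerance) can follow.
//
//   MatH2OpusLowRankUpdate(A,U,NULL,-1.0);
//   MatH2OpusCompress(A,1e-4);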
1874: #endif