Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  4: #include <h2opus.h>
  5: #if defined(H2OPUS_USE_MPI)
  6: #include <h2opus/distributed/distributed_h2opus_handle.h>
  7: #include <h2opus/distributed/distributed_geometric_construction.h>
  8: #include <h2opus/distributed/distributed_hgemv.h>
  9: #include <h2opus/distributed/distributed_horthog.h>
 10: #include <h2opus/distributed/distributed_hcompress.h>
 11: #endif
 12: #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petscsf.h>

 17: /* math2opusutils */
 18: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
 19: PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
 20: PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
 21: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);

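      /* Extract the raw pointer from a thrust host/device vector */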
 23: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 25: /* Use GPU only if H2OPUS is configured for GPU */
 26: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 27: #define PETSC_H2OPUS_USE_GPU
 28: #endif
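      /* MatH2OpusUpdateIfNeeded(A,B): rebind A to the CPU (if B is true or A is already bound to the CPU) or to the GPU, moving the hierarchical matrix data if needed; expands to a no-op for CPU-only configurations */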
 29: #if defined(PETSC_H2OPUS_USE_GPU)
 30: #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
 31: #else
 32: #define MatH2OpusUpdateIfNeeded(A,B) 0
 33: #endif

 35: // TODO H2OPUS:
 36: // DistributedHMatrix
 37: //   unsymmetric ?
 38: //   transpose for distributed_hgemv?
 39: //   clearData()
 40: // Unify interface for sequential and parallel?
 41: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 42: //
 43: template <class T> class PetscPointCloud : public H2OpusDataSet<T>
 44: {
 45:   private:
 46:     int dimension;
 47:     size_t num_points;
 48:     std::vector<T> pts;

 50:   public:
 51:     PetscPointCloud(int dim, size_t num_pts, const T coords[])
 52:     {
 53:       this->dimension = dim;
 54:       this->num_points = num_pts;

 56:       pts.resize(num_pts*dim);
 57:       if (coords) {
 58:         for (size_t n = 0; n < num_points; n++)
 59:           for (int i = 0; i < dim; i++)
 60:             pts[n*dim + i] = coords[n*dim + i];
 61:       } else {
 62:         PetscReal h = 1./(num_points - 1);
 63:         for (size_t n = 0; n < num_points; n++)
 64:           for (int i = 0; i < dim; i++)
  65:             pts[n*dim + i] = n*h;
 66:       }
 67:     }

 69:     PetscPointCloud(const PetscPointCloud<T>& other)
 70:     {
 71:       size_t N = other.dimension * other.num_points;
 72:       this->dimension = other.dimension;
 73:       this->num_points = other.num_points;
 74:       this->pts.resize(N);
 75:       for (size_t i = 0; i < N; i++)
 76:         this->pts[i] = other.pts[i];
 77:     }

 79:     int getDimension() const
 80:     {
 81:         return dimension;
 82:     }

 84:     size_t getDataSetSize() const
 85:     {
 86:         return num_points;
 87:     }

 89:     T getDataPoint(size_t idx, int dim) const
 90:     {
 91:         assert(dim < dimension && idx < num_points);
 92:         return pts[idx*dimension + dim];
 93:     }

 95:     void Print(std::ostream& out = std::cout)
 96:     {
 97:        out << "Dimension: " << dimension << std::endl;
 98:        out << "NumPoints: " << num_points << std::endl;
 99:        for (size_t n = 0; n < num_points; n++) {
100:          for (int d = 0; d < dimension; d++)
101:            out << pts[n*dimension + d] << " ";
102:          out << std::endl;
103:        }
104:     }
105: };

107: template<class T> class PetscFunctionGenerator
108: {
109: private:
110:   MatH2OpusKernel k;
111:   int             dim;
112:   void            *ctx;

114: public:
115:     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
116:     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
117:     T operator()(PetscReal *pt1, PetscReal *pt2)
118:     {
119:         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
120:     }
121: };

123: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

125: /* just to not clutter the code */
126: #if !defined(H2OPUS_USE_GPU)
127: typedef HMatrix HMatrix_GPU;
128: #if defined(H2OPUS_USE_MPI)
129: typedef DistributedHMatrix DistributedHMatrix_GPU;
130: #endif
131: #endif

133: typedef struct {
134: #if defined(H2OPUS_USE_MPI)
135:   distributedH2OpusHandle_t handle;
136: #else
137:   h2opusHandle_t handle;
138: #endif
139:   /* Sequential and parallel matrices are two different classes at the moment */
140:   HMatrix *hmatrix;
141: #if defined(H2OPUS_USE_MPI)
142:   DistributedHMatrix *dist_hmatrix;
143: #else
144:   HMatrix *dist_hmatrix; /* just to not clutter the code */
145: #endif
146:   /* May use permutations */
147:   PetscSF sf;
148:   PetscLayout h2opus_rmap, h2opus_cmap;
149:   IS h2opus_indexmap;
150:   thrust::host_vector<PetscScalar> *xx,*yy;
151:   PetscInt xxs,yys;
152:   PetscBool multsetup;

154:   /* GPU */
155:   HMatrix_GPU *hmatrix_gpu;
156: #if defined(H2OPUS_USE_MPI)
157:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
158: #else
159:   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
160: #endif
161: #if defined(PETSC_H2OPUS_USE_GPU)
162:   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
163:   PetscInt xxs_gpu,yys_gpu;
164: #endif

166:   /* construction from matvecs */
167:   PetscMatrixSampler* sampler;
168:   PetscBool nativemult;

170:   /* Admissibility */
171:   PetscReal eta;
172:   PetscInt  leafsize;

174:   /* for dof reordering */
175:   PetscPointCloud<PetscReal> *ptcloud;

177:   /* kernel for generating matrix entries */
178:   PetscFunctionGenerator<PetscScalar> *kernel;

180:   /* basis orthogonalized? */
181:   PetscBool orthogonal;

183:   /* customization */
184:   PetscInt  basisord;
185:   PetscInt  max_rank;
186:   PetscInt  bs;
187:   PetscReal rtol;
188:   PetscInt  norm_max_samples;
189:   PetscBool check_construction;
190:   PetscBool hara_verbose;

192:   /* keeps track of MatScale values */
193:   PetscScalar s;
194: } Mat_H2OPUS;

196: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
197: {
198:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

202: #if defined(H2OPUS_USE_MPI)
203:   h2opusDestroyDistributedHandle(a->handle);
204: #else
205:   h2opusDestroyHandle(a->handle);
206: #endif
207:   delete a->dist_hmatrix;
208:   delete a->hmatrix;
209:   PetscSFDestroy(&a->sf);
210:   PetscLayoutDestroy(&a->h2opus_rmap);
211:   PetscLayoutDestroy(&a->h2opus_cmap);
212:   ISDestroy(&a->h2opus_indexmap);
213:   delete a->xx;
214:   delete a->yy;
215:   delete a->hmatrix_gpu;
216:   delete a->dist_hmatrix_gpu;
217: #if defined(PETSC_H2OPUS_USE_GPU)
218:   delete a->xx_gpu;
219:   delete a->yy_gpu;
220: #endif
221:   delete a->sampler;
222:   delete a->ptcloud;
223:   delete a->kernel;
224:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL);
225:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL);
226:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL);
227:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL);
228:   PetscObjectChangeTypeName((PetscObject)A,NULL);
229:   PetscFree(A->data);
230:   return(0);
231: }

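      /* Switch between PETSc's ordering/distribution and the one used internally by H2OPUS: when native mult is enabled, the PETSc layouts are swapped with the H2OPUS ones so that compatible vectors can be created */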
233: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
234: {
235:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
236:   PetscBool      ish2opus;

242:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
243:   if (ish2opus) {
244:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246:         PetscLayout t;
247:         t = A->rmap;
248:         A->rmap = a->h2opus_rmap;
249:         a->h2opus_rmap = t;
250:         t = A->cmap;
251:         A->cmap = a->h2opus_cmap;
252:         a->h2opus_cmap = t;
253:       }
254:     }
255:     a->nativemult = nm;
256:   }
257:   return(0);
258: }

260: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261: {
262:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
263:   PetscBool      ish2opus;

269:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
270:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
271:   *nm = a->nativemult;
272:   return(0);
273: }

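      /* Approximate matrix norms via randomized sampling (see MatApproximateNorm_Private); for MATH2OPUS, native mult is temporarily enabled since norms do not depend on the ordering */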
275: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
276: {
278:   PetscBool      ish2opus;
279:   PetscInt       nmax = PETSC_DECIDE;
280:   Mat_H2OPUS     *a = NULL;
281:   PetscBool      mult = PETSC_FALSE;

284:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
 285:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
286:     a = (Mat_H2OPUS*)A->data;

288:     nmax = a->norm_max_samples;
289:     mult = a->nativemult;
290:     MatH2OpusSetNativeMult(A,PETSC_TRUE);
291:   } else {
292:     PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL);
293:   }
294:   MatApproximateNorm_Private(A,normtype,nmax,n);
295:   if (a) { MatH2OpusSetNativeMult(A,mult); }
296:   return(0);
297: }

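      /* Computes C = s * op(A) * B for a dense B: scatters the input through the PetscSF when the H2OPUS ordering differs from PETSc's, then dispatches to the CPU or GPU hgemv */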
299: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
300: {
301:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
302: #if defined(H2OPUS_USE_MPI)
303:   h2opusHandle_t handle = h2opus->handle->handle;
304: #else
305:   h2opusHandle_t handle = h2opus->handle;
306: #endif
307:   PetscBool      boundtocpu = PETSC_TRUE;
308:   PetscScalar    *xx,*yy,*uxx,*uyy;
309:   PetscInt       blda,clda;
310:   PetscMPIInt    size;
311:   PetscSF        bsf,csf;
312:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

316:   HLibProfile::clear();
317: #if defined(PETSC_H2OPUS_USE_GPU)
318:   boundtocpu = A->boundtocpu;
319: #endif
320:   MatDenseGetLDA(B,&blda);
321:   MatDenseGetLDA(C,&clda);
322:   if (usesf) {
323:     PetscInt n;

325:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
326:     PetscObjectQuery((PetscObject)B,"_math2opus_vectorsf",(PetscObject*)&bsf);
327:     if (!bsf) {
328:       PetscSFGetVectorSF(h2opus->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);
329:       PetscObjectCompose((PetscObject)B,"_math2opus_vectorsf",(PetscObject)bsf);
330:       PetscObjectDereference((PetscObject)bsf);
331:     }
332:     PetscObjectQuery((PetscObject)C,"_math2opus_vectorsf",(PetscObject*)&csf);
333:     if (!csf) {
334:       PetscSFGetVectorSF(h2opus->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);
335:       PetscObjectCompose((PetscObject)C,"_math2opus_vectorsf",(PetscObject)csf);
336:       PetscObjectDereference((PetscObject)csf);
337:     }
338:     blda = n;
339:     clda = n;
340:   }
341:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
342:   if (boundtocpu) {
343:     if (usesf) {
344:       PetscInt n;

346:       PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
347:       if (h2opus->xxs < B->cmap->n) { h2opus->xx->resize(n*B->cmap->N); h2opus->xxs = B->cmap->N; }
348:       if (h2opus->yys < B->cmap->n) { h2opus->yy->resize(n*B->cmap->N); h2opus->yys = B->cmap->N; }
349:     }
350:     MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
351:     MatDenseGetArrayWrite(C,&yy);
352:     if (usesf) {
353:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
354:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
355:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
356:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
357:     } else {
358:       uxx = xx;
359:       uyy = yy;
360:     }
361:     if (size > 1) {
362:       if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
363:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
364: #if defined(H2OPUS_USE_MPI)
365:       distributed_hgemv(/*transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
366: #endif
367:     } else {
368:       if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
369:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
370:     }
371:     MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
372:     if (usesf) {
373:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
374:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
375:     }
376:     MatDenseRestoreArrayWrite(C,&yy);
377: #if defined(PETSC_H2OPUS_USE_GPU)
378:   } else {
379:     PetscBool ciscuda,biscuda;

381:     if (usesf) {
382:       PetscInt n;

384:       PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
385:       if (h2opus->xxs_gpu < B->cmap->n) { h2opus->xx_gpu->resize(n*B->cmap->N); h2opus->xxs_gpu = B->cmap->N; }
386:       if (h2opus->yys_gpu < B->cmap->n) { h2opus->yy_gpu->resize(n*B->cmap->N); h2opus->yys_gpu = B->cmap->N; }
387:     }
388:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
389:     PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
390:     if (!biscuda) {
391:       MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
392:     }
393:     PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
394:     if (!ciscuda) {
395:       C->assembled = PETSC_TRUE;
396:       MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
397:     }
398:     MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
399:     MatDenseCUDAGetArrayWrite(C,&yy);
400:     if (usesf) {
401:       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
402:       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
403:       PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
404:       PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
405:     } else {
406:       uxx = xx;
407:       uyy = yy;
408:     }
409:     PetscLogGpuTimeBegin();
410:     if (size > 1) {
411:       if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
412:       if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
413: #if defined(H2OPUS_USE_MPI)
414:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
415: #endif
416:     } else {
417:       if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
418:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
419:     }
420:     PetscLogGpuTimeEnd();
421:     MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
422:     if (usesf) {
423:       PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
424:       PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
425:     }
426:     MatDenseCUDARestoreArrayWrite(C,&yy);
427:     if (!biscuda) {
428:       MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
429:     }
430:     if (!ciscuda) {
431:       MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
432:     }
433: #endif
434:   }
435:   { /* log flops */
436:     double gops,time,perf,dev;
437:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
438: #if defined(PETSC_H2OPUS_USE_GPU)
439:     if (boundtocpu) {
440:       PetscLogFlops(1e9*gops);
441:     } else {
442:       PetscLogGpuFlops(1e9*gops);
443:     }
444: #else
445:     PetscLogFlops(1e9*gops);
446: #endif
447:   }
448:   return(0);
449: }

451: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
452: {
453:   Mat_Product    *product = C->product;

457:   MatCheckProduct(C,1);
458:   switch (product->type) {
459:   case MATPRODUCT_AB:
460:     MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C);
461:     break;
462:   case MATPRODUCT_AtB:
463:     MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C);
464:     break;
465:   default:
466:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
467:   }
468:   return(0);
469: }

471: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
472: {
474:   Mat_Product    *product = C->product;
475:   PetscBool      cisdense;
476:   Mat            A,B;

479:   MatCheckProduct(C,1);
480:   A = product->A;
481:   B = product->B;
482:   switch (product->type) {
483:   case MATPRODUCT_AB:
484:     MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
485:     MatSetBlockSizesFromMats(C,product->A,product->B);
486:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
487:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
488:     MatSetUp(C);
489:     break;
490:   case MATPRODUCT_AtB:
491:     MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
492:     MatSetBlockSizesFromMats(C,product->A,product->B);
493:     PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
494:     if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
495:     MatSetUp(C);
496:     break;
497:   default:
498:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
499:   }
500:   C->ops->productsymbolic = NULL;
501:   C->ops->productnumeric = MatProductNumeric_H2OPUS;
502:   return(0);
503: }

505: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
506: {
508:   MatCheckProduct(C,1);
509:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
510:     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
511:   }
512:   return(0);
513: }

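      /* Computes y = s * op(A) * x + sy * y, handling the optional PetscSF permutation and the CPU/GPU dispatch */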
515: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
516: {
517:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
518: #if defined(H2OPUS_USE_MPI)
519:   h2opusHandle_t handle = h2opus->handle->handle;
520: #else
521:   h2opusHandle_t handle = h2opus->handle;
522: #endif
523:   PetscBool      boundtocpu = PETSC_TRUE;
524:   PetscInt       n;
525:   PetscScalar    *xx,*yy,*uxx,*uyy;
526:   PetscMPIInt    size;
527:   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

531:   HLibProfile::clear();
532:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
533: #if defined(PETSC_H2OPUS_USE_GPU)
534:   boundtocpu = A->boundtocpu;
535: #endif
536:   if (usesf) {
537:     PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
538:   } else n = A->rmap->n;
539:   if (boundtocpu) {
540:     VecGetArrayRead(x,(const PetscScalar**)&xx);
541:     if (sy == 0.0) {
542:       VecGetArrayWrite(y,&yy);
543:     } else {
544:       VecGetArray(y,&yy);
545:     }
546:     if (usesf) {
547:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

550:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
551:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
552:       if (sy != 0.0) {
553:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
554:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
555:       }
556:     } else {
557:       uxx = xx;
558:       uyy = yy;
559:     }
560:     if (size > 1) {
561:       if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
562:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
563: #if defined(H2OPUS_USE_MPI)
564:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565: #endif
566:     } else {
567:       if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
568:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569:     }
570:     VecRestoreArrayRead(x,(const PetscScalar**)&xx);
571:     if (usesf) {
572:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
573:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
574:     }
575:     if (sy == 0.0) {
576:       VecRestoreArrayWrite(y,&yy);
577:     } else {
578:       VecRestoreArray(y,&yy);
579:     }
580: #if defined(PETSC_H2OPUS_USE_GPU)
581:   } else {
582:     VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
583:     if (sy == 0.0) {
584:       VecCUDAGetArrayWrite(y,&yy);
585:     } else {
586:       VecCUDAGetArray(y,&yy);
587:     }
588:     if (usesf) {
589:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

592:       PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
593:       PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
594:       if (sy != 0.0) {
595:         PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
596:         PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
597:       }
598:     } else {
599:       uxx = xx;
600:       uyy = yy;
601:     }
602:     PetscLogGpuTimeBegin();
603:     if (size > 1) {
604:       if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
605:       if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
606: #if defined(H2OPUS_USE_MPI)
607:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608: #endif
609:     } else {
610:       if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
611:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612:     }
613:     PetscLogGpuTimeEnd();
614:     VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
615:     if (usesf) {
616:       PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
617:       PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
618:     }
619:     if (sy == 0.0) {
620:       VecCUDARestoreArrayWrite(y,&yy);
621:     } else {
622:       VecCUDARestoreArray(y,&yy);
623:     }
624: #endif
625:   }
626:   { /* log flops */
627:     double gops,time,perf,dev;
628:     HLibProfile::getHgemvPerf(gops,time,perf,dev);
629: #if defined(PETSC_H2OPUS_USE_GPU)
630:     if (boundtocpu) {
631:       PetscLogFlops(1e9*gops);
632:     } else {
633:       PetscLogGpuFlops(1e9*gops);
634:     }
635: #else
636:     PetscLogFlops(1e9*gops);
637: #endif
638:   }
639:   return(0);
640: }

642: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643: {
644:   PetscBool      xiscuda,yiscuda;

648:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
649:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
650:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
651:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE);
652:   return(0);
653: }

655: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656: {
657:   PetscBool      xiscuda,yiscuda;

661:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
662:   PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
663:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
664:   MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE);
665:   return(0);
666: }

668: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
669: {
670:   PetscBool      xiscuda,ziscuda;

674:   VecCopy(y,z);
675:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
676:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
677:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
678:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE);
679:   return(0);
680: }

682: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
683: {
684:   PetscBool      xiscuda,ziscuda;

688:   VecCopy(y,z);
689:   PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
690:   PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
691:   MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
692:   MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE);
693:   return(0);
694: }

696: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
697: {
698:   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

701:   a->s *= s;
702:   return(0);
703: }

705: static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
706: {
707:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

711:   PetscOptionsHead(PetscOptionsObject,"H2OPUS options");
712:   PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL);
713:   PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
714:   PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
715:   PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
716:   PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
 717:   PetscOptionsInt("-mat_h2opus_normsamples","Maximum number of samples to be taken when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL);
718:   PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
719:   PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
720:   PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL);
721:   PetscOptionsTail();
722:   return(0);
723: }

725: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);

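      /* Build the point cloud from coordinates attached as a Vec (composed as "__math2opus_coords") to the matrix itself or to the sampling matrix */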
727: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
728: {
729:   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
730:   Vec               c;
731:   PetscInt          spacedim;
732:   const PetscScalar *coords;
733:   PetscErrorCode    ierr;

736:   if (a->ptcloud) return(0);
737:   PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c);
738:   if (!c && a->sampler) {
739:     Mat S = a->sampler->GetSamplingMat();

741:     PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c);
742:   }
743:   if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
744:   VecGetArrayRead(c,&coords);
745:   VecGetBlockSize(c,&spacedim);
746:   MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL);
747:   VecRestoreArrayRead(c,&coords);
748:   return(0);
749: }

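      /* Set up the data needed by the multiplication kernels: the index map coming from the H2OPUS cluster tree, the PetscSF used to permute vectors, and the host/device work buffers */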
751: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
752: {
753:   MPI_Comm       comm;
754:   PetscMPIInt    size;
756:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
757:   PetscInt       n = 0,*idx = NULL;
758:   int            *iidx = NULL;
759:   PetscCopyMode  own;
760:   PetscBool      rid;

763:   if (a->multsetup) return(0);
 764:   if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
765:     PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
766: #if defined(PETSC_H2OPUS_USE_GPU)
767:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
768:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
769:     a->xxs_gpu = 1;
770:     a->yys_gpu = 1;
771: #endif
772:     a->xx  = new thrust::host_vector<PetscScalar>(n);
773:     a->yy  = new thrust::host_vector<PetscScalar>(n);
774:     a->xxs = 1;
775:     a->yys = 1;
776:   } else {
777:     IS is;
778:     PetscObjectGetComm((PetscObject)A,&comm);
779:     MPI_Comm_size(comm,&size);
780:     if (!a->h2opus_indexmap) {
781:       if (size > 1) {
782:         if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
783: #if defined(H2OPUS_USE_MPI)
784:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
785:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
786: #endif
787:       } else {
788:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
789:         n    = a->hmatrix->u_basis_tree.index_map.size();
790:       }

792:       if (PetscDefined(USE_64BIT_INDICES)) {
793:         PetscInt i;

795:         own  = PETSC_OWN_POINTER;
796:         PetscMalloc1(n,&idx);
797:         for (i=0;i<n;i++) idx[i] = iidx[i];
798:       } else {
799:         own  = PETSC_COPY_VALUES;
800:         idx  = (PetscInt*)iidx;
801:       }
802:       ISCreateGeneral(comm,n,idx,own,&is);
803:       ISSetPermutation(is);
804:       ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view");
805:       a->h2opus_indexmap = is;
806:     }
807:     ISGetLocalSize(a->h2opus_indexmap,&n);
808:     ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
809:     rid  = (PetscBool)(n == A->rmap->n);
810:     MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm);
811:     if (rid) {
812:       ISIdentity(a->h2opus_indexmap,&rid);
813:     }
814:     if (!rid) {
815:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
816:         PetscLayoutCreate(comm,&a->h2opus_rmap);
817:         PetscLayoutSetLocalSize(a->h2opus_rmap,n);
818:         PetscLayoutSetUp(a->h2opus_rmap);
819:         PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap);
820:       }
821:       PetscSFCreate(comm,&a->sf);
822:       PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
823:       PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view");
824: #if defined(PETSC_H2OPUS_USE_GPU)
825:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
826:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
827:       a->xxs_gpu = 1;
828:       a->yys_gpu = 1;
829: #endif
830:       a->xx  = new thrust::host_vector<PetscScalar>(n);
831:       a->yy  = new thrust::host_vector<PetscScalar>(n);
832:       a->xxs = 1;
833:       a->yys = 1;
834:     }
835:     ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
836:   }
837:   a->multsetup = PETSC_TRUE;
838:   return(0);
839: }

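      /* Build (or rebuild) the hierarchical matrix: geometric construction from the kernel when available, otherwise structure-only construction followed by adaptive sampling (hara) of the user-provided matrix; optionally checks the construction error */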
841: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
842: {
843:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
844: #if defined(H2OPUS_USE_MPI)
845:   h2opusHandle_t handle = a->handle->handle;
846: #else
847:   h2opusHandle_t handle = a->handle;
848: #endif
849:   PetscBool      kernel = PETSC_FALSE;
850:   PetscBool      boundtocpu = PETSC_TRUE;
851:   PetscBool      samplingdone = PETSC_FALSE;
852:   MPI_Comm       comm;
853:   PetscMPIInt    size;

857:   PetscObjectGetComm((PetscObject)A,&comm);
858:   if (A->rmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
859:   if (A->rmap->N != A->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
860:   MPI_Comm_size(comm,&size);
861:   /* TODO REUSABILITY of geometric construction */
862:   delete a->hmatrix;
863:   delete a->dist_hmatrix;
864: #if defined(PETSC_H2OPUS_USE_GPU)
865:   delete a->hmatrix_gpu;
866:   delete a->dist_hmatrix_gpu;
867: #endif
868:   a->orthogonal = PETSC_FALSE;

870:   /* TODO: other? */
871:   H2OpusBoxCenterAdmissibility adm(a->eta);

873:   PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0);
874:   if (size > 1) {
875: #if defined(H2OPUS_USE_MPI)
876:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
877: #else
878:     a->dist_hmatrix = NULL;
879: #endif
880:   } else {
881:     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
882:   }
883:   MatH2OpusInferCoordinates_Private(A);
884:   if (!a->ptcloud) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
885:   if (a->kernel) {
886:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
887:     if (size > 1) {
888:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
889: #if defined(H2OPUS_USE_MPI)
890:       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
891: #endif
892:     } else {
893:       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
894:     }
895:     kernel = PETSC_TRUE;
896:   } else {
897:     if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
898:     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
899:   }
900:   MatSetUpMultiply_H2OPUS(A);

902: #if defined(PETSC_H2OPUS_USE_GPU)
903:   boundtocpu = A->boundtocpu;
904:   if (!boundtocpu) {
905:     if (size > 1) {
906:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
907: #if defined(H2OPUS_USE_MPI)
908:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
909: #endif
910:     } else {
911:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
912:     }
913:   }
914: #endif
915:   if (size == 1) {
916:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
917:       PetscReal Anorm;
918:       bool      verbose;

920:       PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL);
921:       verbose = a->hara_verbose;
922:       MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm);
923:       if (a->hara_verbose) { PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs); }
924:       if (a->sf && !a->nativemult) {
925:         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
926:       }
927:       a->sampler->SetStream(handle->getMainStream());
928:       if (boundtocpu) {
929:         a->sampler->SetGPUSampling(false);
930:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
931: #if defined(PETSC_H2OPUS_USE_GPU)
932:       } else {
933:         a->sampler->SetGPUSampling(true);
934:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
935: #endif
936:       }
937:       samplingdone = PETSC_TRUE;
938:     }
939:   }
940: #if defined(PETSC_H2OPUS_USE_GPU)
941:   if (!boundtocpu) {
942:     delete a->hmatrix;
943:     delete a->dist_hmatrix;
944:     a->hmatrix = NULL;
945:     a->dist_hmatrix = NULL;
946:   }
947:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
948: #endif
949:   PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0);

951:   if (!a->s) a->s = 1.0;
952:   A->assembled = PETSC_TRUE;

954:   if (samplingdone) {
955:     PetscBool check = a->check_construction;
956:     PetscBool checke = PETSC_FALSE;

958:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL);
959:     PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL);
960:     if (check) {
961:       Mat       E,Ae;
962:       PetscReal n1,ni,n2;
963:       PetscReal n1A,niA,n2A;
964:       void      (*normfunc)(void);

966:       Ae   = a->sampler->GetSamplingMat();
967:       MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
968:       MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
969:       MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
970:       MatNorm(E,NORM_1,&n1);
971:       MatNorm(E,NORM_INFINITY,&ni);
972:       MatNorm(E,NORM_2,&n2);
973:       if (checke) {
974:         Mat eA,eE,eAe;

976:         MatComputeOperator(A,MATAIJ,&eA);
977:         MatComputeOperator(E,MATAIJ,&eE);
978:         MatComputeOperator(Ae,MATAIJ,&eAe);
979:         MatChop(eA,PETSC_SMALL);
980:         MatChop(eE,PETSC_SMALL);
981:         MatChop(eAe,PETSC_SMALL);
982:         PetscObjectSetName((PetscObject)eA,"H2Mat");
983:         MatView(eA,NULL);
984:         PetscObjectSetName((PetscObject)eAe,"S");
985:         MatView(eAe,NULL);
986:         PetscObjectSetName((PetscObject)eE,"H2Mat - S");
987:         MatView(eE,NULL);
988:         MatDestroy(&eA);
989:         MatDestroy(&eE);
990:         MatDestroy(&eAe);
991:       }

993:       MatGetOperation(Ae,MATOP_NORM,&normfunc);
994:       MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
995:       MatNorm(Ae,NORM_1,&n1A);
996:       MatNorm(Ae,NORM_INFINITY,&niA);
997:       MatNorm(Ae,NORM_2,&n2A);
998:       n1A  = PetscMax(n1A,PETSC_SMALL);
999:       n2A  = PetscMax(n2A,PETSC_SMALL);
1000:       niA  = PetscMax(niA,PETSC_SMALL);
1001:       MatSetOperation(Ae,MATOP_NORM,normfunc);
1002:       PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
1003:       MatDestroy(&E);
1004:     }
1005:   }
1006:   return(0);
1007: }

1009: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1010: {
1012:   PetscMPIInt    size;
1013:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1016:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1017:   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1018:   else {
1019:     a->hmatrix->clearData();
1020: #if defined(PETSC_H2OPUS_USE_GPU)
1021:     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1022: #endif
1023:   }
1024:   return(0);
1025: }

1027: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1028: {
1029:   Mat            A;
1030:   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1031: #if defined(PETSC_H2OPUS_USE_GPU)
1032:   PetscBool      iscpu = PETSC_FALSE;
1033: #else
1034:   PetscBool      iscpu = PETSC_TRUE;
1035: #endif
1037:   MPI_Comm       comm;

1040:   PetscObjectGetComm((PetscObject)B,&comm);
1041:   MatCreate(comm,&A);
1042:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1043:   MatSetType(A,MATH2OPUS);
1044:   MatPropagateSymmetryOptions(B,A);
1045:   a = (Mat_H2OPUS*)A->data;

1047:   a->eta              = b->eta;
1048:   a->leafsize         = b->leafsize;
1049:   a->basisord         = b->basisord;
1050:   a->max_rank         = b->max_rank;
1051:   a->bs               = b->bs;
1052:   a->rtol             = b->rtol;
1053:   a->norm_max_samples = b->norm_max_samples;
1054:   if (op == MAT_COPY_VALUES) a->s = b->s;

1056:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1057:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1059: #if defined(H2OPUS_USE_MPI)
1060:   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1061: #if defined(PETSC_H2OPUS_USE_GPU)
1062:   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1063: #endif
1064: #endif
1065:   if (b->hmatrix) {
1066:     a->hmatrix = new HMatrix(*b->hmatrix);
1067:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1068:   }
1069: #if defined(PETSC_H2OPUS_USE_GPU)
1070:   if (b->hmatrix_gpu) {
1071:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1072:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1073:   }
1074: #endif
1075:   if (b->sf) {
1076:     PetscObjectReference((PetscObject)b->sf);
1077:     a->sf = b->sf;
1078:   }
1079:   if (b->h2opus_indexmap) {
1080:     PetscObjectReference((PetscObject)b->h2opus_indexmap);
1081:     a->h2opus_indexmap = b->h2opus_indexmap;
1082:   }

1084:   MatSetUp(A);
1085:   MatSetUpMultiply_H2OPUS(A);
1086:   if (op == MAT_COPY_VALUES) {
1087:     A->assembled = PETSC_TRUE;
1088:     a->orthogonal = b->orthogonal;
1089: #if defined(PETSC_H2OPUS_USE_GPU)
1090:     A->offloadmask = B->offloadmask;
1091: #endif
1092:   }
1093: #if defined(PETSC_H2OPUS_USE_GPU)
1094:   iscpu = B->boundtocpu;
1095: #endif
1096:   MatBindToCPU(A,iscpu);

1098:   *nA = A;
1099:   return(0);
1100: }

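      /* For ASCII viewers: dump the hierarchical matrix (MATLAB format, sequential only) or report construction parameters and CPU/GPU memory consumption */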
1102: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1103: {
1104:   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1105:   PetscBool         isascii;
1106:   PetscErrorCode    ierr;
1107:   PetscMPIInt       size;
1108:   PetscViewerFormat format;

1111:   PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1112:   PetscViewerGetFormat(view,&format);
1113:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1114:   if (isascii) {
1115:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1116:       if (size == 1) {
1117:         FILE *fp;
1118:         PetscViewerASCIIGetPointer(view,&fp);
1119:         dumpHMatrix(*h2opus->hmatrix,6,fp);
1120:       }
1121:     } else {
1122:       PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat");
1123:       PetscViewerASCIIPrintf(view,"  PointCloud dim %D\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1124:       PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %D, eta %g\n",h2opus->leafsize,(double)h2opus->eta);
1125:       if (!h2opus->kernel) {
1126:         PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %D, samples %D, tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol);
1127:       } else {
1128:         PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %D\n",h2opus->basisord);
1129:       }
1130:       PetscViewerASCIIPrintf(view,"  Number of samples for norms %D\n",h2opus->norm_max_samples);
1131:       if (size == 1) {
1132:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1133:         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1134: #if defined(PETSC_HAVE_CUDA)
1135:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1136:         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1137: #endif
1138:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1139: #if defined(PETSC_HAVE_CUDA)
1140:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1141: #endif
1142:       } else {
1143: #if defined(PETSC_HAVE_CUDA)
1144:         double      matrix_mem[4] = {0.,0.,0.,0.};
1145:         PetscMPIInt rsize = 4;
1146: #else
1147:         double      matrix_mem[2] = {0.,0.};
1148:         PetscMPIInt rsize = 2;
1149: #endif
1150: #if defined(H2OPUS_USE_MPI)
1151:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1152:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1153: #if defined(PETSC_HAVE_CUDA)
1154:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1155:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1156: #endif
1157: #endif
1158:         MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A));
1159:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1160: #if defined(PETSC_HAVE_CUDA)
1161:         PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1162: #endif
1163:       }
1164:     }
1165:   }
1166: #if 0
1167:   if (size == 1) {
1168:     char filename[256];
1169:     const char *name;

1171:     PetscObjectGetName((PetscObject)A,&name);
1172:     PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name);
1173:     outputEps(*h2opus->hmatrix,filename);
1174:   }
1175: #endif
1176:   return(0);
1177: }

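      /* Store the point coordinates (gathering them from all processes when cdist is true) and the optional kernel used to generate matrix entries */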
1179: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1180: {
1181:   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1182:   PetscReal      *gcoords;
1183:   PetscInt       N;
1184:   MPI_Comm       comm;
1185:   PetscMPIInt    size;
1186:   PetscBool      cong;

1190:   PetscLayoutSetUp(A->rmap);
1191:   PetscLayoutSetUp(A->cmap);
1192:   PetscObjectGetComm((PetscObject)A,&comm);
1193:   MatHasCongruentLayouts(A,&cong);
1194:   if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1195:   N    = A->rmap->N;
1196:   MPI_Comm_size(comm,&size);
1197:   if (size > 1 && cdist) {
1198:     PetscSF      sf;
1199:     MPI_Datatype dtype;

1201:     MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1202:     MPI_Type_commit(&dtype);

1204:     PetscSFCreate(comm,&sf);
1205:     PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1206:     PetscMalloc1(spacedim*N,&gcoords);
1207:     PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1208:     PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1209:     PetscSFDestroy(&sf);
1210:     MPI_Type_free(&dtype);
1211:   } else gcoords = (PetscReal*)coords;

1213:   delete h2opus->ptcloud;
1214:   delete h2opus->kernel;
1215:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1216:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1217:   if (gcoords != coords) { PetscFree(gcoords); }
1218:   A->preallocated = PETSC_TRUE;
1219:   return(0);
1220: }

1222: #if defined(PETSC_H2OPUS_USE_GPU)
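      /* Move the hierarchical matrix between host and device when the bind-to-CPU state changes, and switch the default vector type accordingly */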
1223: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1224: {
1225:   PetscMPIInt    size;
1226:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1230:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1231:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1232:     if (size > 1) {
1233:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1234: #if defined(H2OPUS_USE_MPI)
1235:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1236:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1237: #endif
1238:     } else {
1239:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1240:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1241:       else *a->hmatrix = *a->hmatrix_gpu;
1242:     }
1243:     delete a->hmatrix_gpu;
1244:     delete a->dist_hmatrix_gpu;
1245:     a->hmatrix_gpu = NULL;
1246:     a->dist_hmatrix_gpu = NULL;
1247:     A->offloadmask = PETSC_OFFLOAD_CPU;
1248:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1249:     if (size > 1) {
1250:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1251: #if defined(H2OPUS_USE_MPI)
1252:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1253:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1254: #endif
1255:     } else {
1256:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1257:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1258:       else *a->hmatrix_gpu = *a->hmatrix;
1259:     }
1260:     delete a->hmatrix;
1261:     delete a->dist_hmatrix;
1262:     a->hmatrix = NULL;
1263:     a->dist_hmatrix = NULL;
1264:     A->offloadmask = PETSC_OFFLOAD_GPU;
1265:   }
1266:   PetscFree(A->defaultvectype);
1267:   if (!flg) {
1268:     PetscStrallocpy(VECCUDA,&A->defaultvectype);
1269:   } else {
1270:     PetscStrallocpy(VECSTANDARD,&A->defaultvectype);
1271:   }
1272:   A->boundtocpu = flg;
1273:   return(0);
1274: }
1275: #endif

1277: /*MC
1278:      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.

1280:    Options Database Keys:
1281: .     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1283:    Notes:
1284:      H2Opus implements hierarchical matrices in the H^2 flavour.
1285:      It supports CPU or NVIDIA GPUs.
1286:      For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust.
1287:      To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
1288:      For details and additional references, see
1289:        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1290:      available at https://arxiv.org/abs/2109.05451.
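
     Example usage, a minimal sketch constructing the matrix from a user kernel (mykernel, coords, sdim and N
     are illustrative placeholders, not defined in this file; mykernel has the MatH2OpusKernel signature and
     coords[] holds the sdim*N coordinates of all points, since cdist is passed as PETSC_FALSE):
.vb
       Mat A;

       MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,N,N,sdim,coords,PETSC_FALSE,mykernel,NULL,0.9,32,4,&A);
       MatSetFromOptions(A);
       MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
       MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve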

1292:    Level: beginner

1294: .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1295: M*/
1296: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1297: {
1298:   Mat_H2OPUS     *a;
1300:   PetscMPIInt    size;

1303: #if defined(PETSC_H2OPUS_USE_GPU)
1304:   PetscCUDAInitializeCheck();
1305: #endif
1306:   PetscNewLog(A,&a);
1307:   A->data = (void*)a;

1309:   a->eta              = 0.9;
1310:   a->leafsize         = 32;
1311:   a->basisord         = 4;
1312:   a->max_rank         = 64;
1313:   a->bs               = 32;
1314:   a->rtol             = 1.e-4;
1315:   a->s                = 1.0;
1316:   a->norm_max_samples = 10;
1317: #if defined(H2OPUS_USE_MPI)
1318:   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1319: #else
1320:   h2opusCreateHandle(&a->handle);
1321: #endif
1322:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1323:   PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS);
1324:   PetscMemzero(A->ops,sizeof(struct _MatOps));

1326:   A->ops->destroy          = MatDestroy_H2OPUS;
1327:   A->ops->view             = MatView_H2OPUS;
1328:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1329:   A->ops->mult             = MatMult_H2OPUS;
1330:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1331:   A->ops->multadd          = MatMultAdd_H2OPUS;
1332:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1333:   A->ops->scale            = MatScale_H2OPUS;
1334:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1335:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1336:   A->ops->norm             = MatNorm_H2OPUS;
1337:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1338: #if defined(PETSC_H2OPUS_USE_GPU)
1339:   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1340: #endif

1342:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS);
1343:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS);
1344:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS);
1345:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS);
1346: #if defined(PETSC_H2OPUS_USE_GPU)
1347:   PetscFree(A->defaultvectype);
1348:   PetscStrallocpy(VECCUDA,&A->defaultvectype);
1349: #endif
1350:   return(0);
1351: }

1353: /*@C
1354:      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1356:    Input Parameter:
1357: .     A - the matrix

1359:    Level: intermediate

1361: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1362: @*/
1363: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1364: {
1366:   PetscBool      ish2opus;
1367:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1368:   PetscMPIInt    size;
1369:   PetscBool      boundtocpu = PETSC_TRUE;

1374:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1375:   if (!ish2opus) return(0);
1376:   if (a->orthogonal) return(0);
1377:   HLibProfile::clear();
1378:   PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0);
1379: #if defined(PETSC_H2OPUS_USE_GPU)
1380:   boundtocpu = A->boundtocpu;
1381: #endif
1382:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1383:   if (size > 1) {
1384:     if (boundtocpu) {
1385:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1386: #if defined(H2OPUS_USE_MPI)
1387:       distributed_horthog(*a->dist_hmatrix, a->handle);
1388: #endif
1389: #if defined(PETSC_H2OPUS_USE_GPU)
1390:       A->offloadmask = PETSC_OFFLOAD_CPU;
1391:     } else {
1392:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1393:       PetscLogGpuTimeBegin();
1394: #if defined(H2OPUS_USE_MPI)
1395:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1396: #endif
1397:       PetscLogGpuTimeEnd();
1398: #endif
1399:     }
1400:   } else {
1401: #if defined(H2OPUS_USE_MPI)
1402:     h2opusHandle_t handle = a->handle->handle;
1403: #else
1404:     h2opusHandle_t handle = a->handle;
1405: #endif
1406:     if (boundtocpu) {
1407:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1408:       horthog(*a->hmatrix, handle);
1409: #if defined(PETSC_H2OPUS_USE_GPU)
1410:       A->offloadmask = PETSC_OFFLOAD_CPU;
1411:     } else {
1412:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1413:       PetscLogGpuTimeBegin();
1414:       horthog(*a->hmatrix_gpu, handle);
1415:       PetscLogGpuTimeEnd();
1416: #endif
1417:     }
1418:   }
1419:   a->orthogonal = PETSC_TRUE;
1420:   { /* log flops */
1421:     double gops,time,perf,dev;
1422:     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1423: #if defined(PETSC_H2OPUS_USE_GPU)
1424:     if (boundtocpu) {
1425:       PetscLogFlops(1e9*gops);
1426:     } else {
1427:       PetscLogGpuFlops(1e9*gops);
1428:     }
1429: #else
1430:     PetscLogFlops(1e9*gops);
1431: #endif
1432:   }
1433:   PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0);
1434:   return(0);
1435: }

1437: /*@C
1438:      MatH2OpusCompress - Compress a hierarchical matrix.

1440:    Input Parameters:
1441: +     A - the matrix
1442: -     tol - the absolute truncation threshold

1444:    Level: intermediate

1446: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1447: @*/
1448: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1449: {
1451:   PetscBool      ish2opus;
1452:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1453:   PetscMPIInt    size;
1454:   PetscBool      boundtocpu = PETSC_TRUE;

1459:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1460:   if (!ish2opus) return(0);
1461:   MatH2OpusOrthogonalize(A);
1462:   HLibProfile::clear();
1463:   PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0);
1464: #if defined(PETSC_H2OPUS_USE_GPU)
1465:   boundtocpu = A->boundtocpu;
1466: #endif
1467:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1468:   if (size > 1) {
1469:     if (boundtocpu) {
1470:       if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1471: #if defined(H2OPUS_USE_MPI)
1472:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1473: #endif
1474: #if defined(PETSC_H2OPUS_USE_GPU)
1475:       A->offloadmask = PETSC_OFFLOAD_CPU;
1476:     } else {
1477:       if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1478:       PetscLogGpuTimeBegin();
1479: #if defined(H2OPUS_USE_MPI)
1480:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1481: #endif
1482:       PetscLogGpuTimeEnd();
1483: #endif
1484:     }
1485:   } else {
1486: #if defined(H2OPUS_USE_MPI)
1487:     h2opusHandle_t handle = a->handle->handle;
1488: #else
1489:     h2opusHandle_t handle = a->handle;
1490: #endif
1491:     if (boundtocpu) {
1492:       if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1493:       hcompress(*a->hmatrix, tol, handle);
1494: #if defined(PETSC_H2OPUS_USE_GPU)
1495:       A->offloadmask = PETSC_OFFLOAD_CPU;
1496:     } else {
1497:       if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1498:       PetscLogGpuTimeBegin();
1499:       hcompress(*a->hmatrix_gpu, tol, handle);
1500:       PetscLogGpuTimeEnd();
1501: #endif
1502:     }
1503:   }
1504:   { /* log flops */
1505:     double gops,time,perf,dev;
1506:     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1507: #if defined(PETSC_H2OPUS_USE_GPU)
1508:     if (boundtocpu) {
1509:       PetscLogFlops(1e9*gops);
1510:     } else {
1511:       PetscLogGpuFlops(1e9*gops);
1512:     }
1513: #else
1514:     PetscLogFlops(1e9*gops);
1515: #endif
1516:   }
1517:   PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0);
1518:   return(0);
1519: }
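A hedged usage sketch for the routine above (the matrix H and the threshold value are hypothetical; error checking omitted):

  Mat H;                       /* assembled MATH2OPUS matrix (hypothetical) */
  Vec x,y;
  MatCreateVecs(H,&x,&y);
  VecSetRandom(x,NULL);
  MatH2OpusCompress(H,1.e-6);  /* orthogonalizes first, then truncates with an absolute threshold */
  MatMult(H,x,y);
  VecDestroy(&x);
  VecDestroy(&y);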

1521: /*@C
1522:      MatH2OpusSetSamplingMat - Set the matrix to be sampled from matrix-vector products to construct a hierarchical matrix.

1524:    Input Parameters:
1525: +     A - the hierarchical matrix
1526: .     B - the matrix to be sampled
1527: .     bs - maximum number of samples to be taken concurrently
1528: -     tol - relative tolerance for construction

1530:    Notes: Call MatAssemblyBegin()/MatAssemblyEnd() to update the hierarchical matrix.

1532:    Level: intermediate

1534: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1535: @*/
1536: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1537: {
1538:   PetscBool      ish2opus;

1545:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1546:   if (ish2opus) {
1547:     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;

1549:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1550:     a->sampler->SetSamplingMat(B);
1551:     if (bs > 0) a->bs = bs;
1552:     if (tol > 0.) a->rtol = tol;
1553:     delete a->kernel;
1554:   }
1555:   return(0);
1556: }
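A hedged sketch of the intended call sequence (H and Anew are hypothetical, the block size 32 is arbitrary, and Anew is assumed to have the same layout as the operator originally sampled; error checking omitted):

  Mat H;      /* MATH2OPUS matrix previously built with MatCreateH2OpusFromMat() */
  Mat Anew;   /* new operator to be sampled (hypothetical) */
  MatH2OpusSetSamplingMat(H,Anew,32,PETSC_DEFAULT);  /* bs = 32, keep the current rtol */
  MatAssemblyBegin(H,MAT_FINAL_ASSEMBLY);            /* rebuild the hierarchical matrix from the new samples */
  MatAssemblyEnd(H,MAT_FINAL_ASSEMBLY);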

1558: /*@C
1559:      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.

1561:    Input Parameters:
1562: +     comm - MPI communicator
1563: .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1564: .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1565: .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1566: .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1567: .     spacedim - dimension of the space coordinates
1568: .     coords - coordinates of the points
1569: .     cdist - whether or not coordinates are distributed
1570: .     kernel - computational kernel (or NULL)
1571: .     kernelctx - kernel context
1572: .     eta - admissibility condition tolerance
1573: .     leafsize - leaf size in cluster tree
1574: -     basisord - approximation order for Chebyshev interpolation of low-rank blocks

1576:    Output Parameter:
1577: .     nA - matrix

1579:    Options Database Keys:
1580: +     -mat_h2opus_leafsize <PetscInt> - Leaf size in the cluster tree
1581: .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1582: .     -mat_h2opus_order <PetscInt> - Chebyshev approximation order
1583: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

1585:    Level: intermediate

1587: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1588: @*/
1589: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1590: {
1591:   Mat            A;
1592:   Mat_H2OPUS     *h2opus;
1593: #if defined(PETSC_H2OPUS_USE_GPU)
1594:   PetscBool      iscpu = PETSC_FALSE;
1595: #else
1596:   PetscBool      iscpu = PETSC_TRUE;
1597: #endif

1601:   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1602:   MatCreate(comm,&A);
1603:   MatSetSizes(A,m,n,M,N);
1604:   if (M != N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1605:   MatSetType(A,MATH2OPUS);
1606:   MatBindToCPU(A,iscpu);
1607:   MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx);

1609:   h2opus = (Mat_H2OPUS*)A->data;
1610:   if (eta > 0.) h2opus->eta = eta;
1611:   if (leafsize > 0) h2opus->leafsize = leafsize;
1612:   if (basisord > 0) h2opus->basisord = basisord;

1614:   *nA = A;
1615:   return(0);
1616: }
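A hedged end-to-end sketch of using this constructor (the kernel, the local size nloc, and the coords array are hypothetical; the callback is assumed to follow the MatH2OpusKernel signature declared in petscmat.h; error checking omitted):

  static PetscScalar ExpKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
  {
    PetscReal d2 = 0.0;
    for (PetscInt d = 0; d < sdim; d++) d2 += (x[d] - y[d])*(x[d] - y[d]);
    return PetscExpReal(-PetscSqrtReal(d2));  /* exp(-|x - y|) */
  }

  /* ... in the application ... */
  Mat       H;
  PetscInt  nloc;     /* number of locally owned points (hypothetical) */
  PetscReal *coords;  /* nloc*3 coordinates, locally owned (hypothetical) */
  MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,nloc,nloc,PETSC_DETERMINE,PETSC_DETERMINE,
                            3,coords,PETSC_TRUE,ExpKernel,NULL,
                            PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT,&H);  /* keep default eta, leafsize, basisord */
  MatAssemblyBegin(H,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(H,MAT_FINAL_ASSEMBLY);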

1618: /*@C
1619:      MatCreateH2OpusFromMat - Creates a MATH2OPUS matrix by sampling a user-supplied operator.

1621:    Input Parameters:
1622: +     B - the matrix to be sampled
1623: .     spacedim - dimension of the space coordinates
1624: .     coords - coordinates of the points
1625: .     cdist - whether or not coordinates are distributed
1626: .     eta - admissibility condition tolerance
1627: .     leafsize - leaf size in cluster tree
1628: .     maxrank - maximum rank allowed
1629: .     bs - maximum number of samples to be taken concurrently
1630: -     rtol - relative tolerance for construction

1632:    Output Parameter:
1633: .     nA - matrix

1635:    Options Database Keys:
1636: +     -mat_h2opus_leafsize <PetscInt> - Leaf size in the cluster tree
1637: .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1638: .     -mat_h2opus_maxrank <PetscInt> - Maximum rank allowed
1639: .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently
1640: .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction
1641: .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1642: .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1643: -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms

1645:    Notes: not available in parallel

1647:    Level: intermediate

1649: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1650: @*/
1651: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1652: {
1653:   Mat            A;
1654:   Mat_H2OPUS     *h2opus;
1655:   MPI_Comm       comm;
1656:   PetscBool      boundtocpu = PETSC_TRUE;

1668:   PetscObjectGetComm((PetscObject)B,&comm);
1669:   if (B->rmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1670:   if (B->rmap->N != B->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1671:   MatCreate(comm,&A);
1672:   MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1673: #if defined(PETSC_H2OPUS_USE_GPU)
1674:   {
1675:     PetscBool iscuda;
1676:     VecType   vtype;

1678:     MatGetVecType(B,&vtype);
1679:     PetscStrcmp(vtype,VECCUDA,&iscuda);
1680:     if (!iscuda) {
1681:       PetscStrcmp(vtype,VECSEQCUDA,&iscuda);
1682:       if (!iscuda) {
1683:         PetscStrcmp(vtype,VECMPICUDA,&iscuda);
1684:       }
1685:     }
1686:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1687:   }
1688: #endif
1689:   MatSetType(A,MATH2OPUS);
1690:   MatBindToCPU(A,boundtocpu);
1691:   if (spacedim) {
1692:     MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL);
1693:   }
1694:   MatPropagateSymmetryOptions(B,A);
1695:   /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1697:   h2opus = (Mat_H2OPUS*)A->data;
1698:   h2opus->sampler = new PetscMatrixSampler(B);
1699:   if (eta > 0.) h2opus->eta = eta;
1700:   if (leafsize > 0) h2opus->leafsize = leafsize;
1701:   if (maxrank > 0) h2opus->max_rank = maxrank;
1702:   if (bs > 0) h2opus->bs = bs;
1703:   if (rtol > 0.) h2opus->rtol = rtol;
1704:   *nA = A;
1705:   A->preallocated = PETSC_TRUE;
1706:   return(0);
1707: }
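A hedged sketch of the intended usage (the operator K, the value of spacedim, and the coords array are hypothetical; error checking omitted):

  Mat       K;        /* square, sequential operator to be sampled (hypothetical) */
  Mat       H;
  PetscInt  spacedim = 2;
  PetscReal *coords;  /* spacedim coordinates per row of K (hypothetical) */
  MatCreateH2OpusFromMat(K,spacedim,coords,PETSC_FALSE,PETSC_DEFAULT,PETSC_DEFAULT,
                         PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT,&H);  /* keep default eta, leafsize, maxrank, bs, rtol */
  MatAssemblyBegin(H,MAT_FINAL_ASSEMBLY);  /* runs the sampling-based construction */
  MatAssemblyEnd(H,MAT_FINAL_ASSEMBLY);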

1709: /*@C
1710:      MatH2OpusGetIndexMap - Access the reordering index set.

1712:    Input Parameters:
1713: .     A - the matrix

1715:    Output Parameter:
1716: .     indexmap - the index set for the reordering

1718:    Level: intermediate

1720: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1721: @*/
1722: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1723: {
1724:   PetscBool      ish2opus;
1725:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

1732:   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1733:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1734:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1735:   *indexmap = a->h2opus_indexmap;
1736:   return(0);
1737: }
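A hedged sketch (hypothetical matrix H; error checking omitted). Since the routine above hands back the index set stored in the matrix without taking an extra reference, the sketch only views it and does not destroy it:

  Mat H;    /* assembled MATH2OPUS matrix (hypothetical) */
  IS  map;
  MatH2OpusGetIndexMap(H,&map);            /* clustering-induced reordering */
  ISView(map,PETSC_VIEWER_STDOUT_WORLD);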

1739: /*@C
1740:      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

1742:    Input Parameters:
1743: +     A - the matrix
1744: .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1745: -     in - the vector to be mapped

1747:    Output Parameter:
1748: .     out - the newly created mapped vector

1750:    Level: intermediate

1752: .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1753: @*/
1754: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1755: {
1756:   PetscBool      ish2opus;
1757:   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1758:   PetscScalar    *xin,*xout;
1759:   PetscBool      nm;

1768:   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1769:   PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1770:   if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1771:   nm   = a->nativemult;
1772:   MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc);
1773:   MatCreateVecs(A,out,NULL);
1774:   MatH2OpusSetNativeMult(A,nm);
1775:   if (!a->sf) { /* same ordering */
1776:     VecCopy(in,*out);
1777:     return(0);
1778:   }
1779:   VecGetArrayRead(in,(const PetscScalar**)&xin);
1780:   VecGetArrayWrite(*out,&xout);
1781:   if (nativetopetsc) {
1782:     PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1783:     PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1784:   } else {
1785:     PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1786:     PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1787:   }
1788:   VecRestoreArrayRead(in,(const PetscScalar**)&xin);
1789:   VecRestoreArrayWrite(*out,&xout);
1790:   return(0);
1791: }
1792: #endif
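Finally, a hedged sketch of MatH2OpusMapVec() (H and xp are hypothetical; error checking omitted): it maps a vector from the PETSc ordering to the internal H2Opus ordering; since the output vector is newly created by the call, the caller destroys it when done:

  Mat H;    /* assembled MATH2OPUS matrix (hypothetical) */
  Vec xp;   /* vector in the PETSc ordering (hypothetical) */
  Vec xh;   /* created by the call, in the H2Opus ordering */
  MatH2OpusMapVec(H,PETSC_FALSE,xp,&xh);  /* nativetopetsc = PETSC_FALSE: PETSc -> H2Opus */
  /* ... use xh, e.g. together with MatH2OpusSetNativeMult(H,PETSC_TRUE) and MatMult() ... */
  VecDestroy(&xh);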