Actual source code: math2opus.cu

  1: #include <h2opusconf.h>
  2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
  4:   #include <h2opus.h>
  5:   #if defined(H2OPUS_USE_MPI)
  6:     #include <h2opus/distributed/distributed_h2opus_handle.h>
  7:     #include <h2opus/distributed/distributed_geometric_construction.h>
  8:     #include <h2opus/distributed/distributed_hgemv.h>
  9:     #include <h2opus/distributed/distributed_horthog.h>
 10:     #include <h2opus/distributed/distributed_hcompress.h>
 11:   #endif
 12:   #include <h2opus/util/boxentrygen.h>
 13: #include <petsc/private/matimpl.h>
 14: #include <petsc/private/vecimpl.h>
 15: #include <petsc/private/deviceimpl.h>
 16: #include <petscsf.h>

 18: /* math2opusutils */
 19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
 20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
 21: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
 22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
 23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);

 25:   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())

 27:   /* Use GPU only if H2OPUS is configured for GPU */
 28:   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
 29:     #define PETSC_H2OPUS_USE_GPU
 30:   #endif
 31:   #if defined(PETSC_H2OPUS_USE_GPU)
 32:     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
 33:   #else
 34:     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
 35:   #endif

 37: // TODO H2OPUS:
 38: // DistributedHMatrix
 39: //   unsymmetric ?
 40: //   transpose for distributed_hgemv?
 41: //   clearData()
 42: // Unify interface for sequential and parallel?
 43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
 44: //
 45: template <class T>
 46: class PetscPointCloud : public H2OpusDataSet<T> {
 47: private:
 48:   int            dimension;
 49:   size_t         num_points;
 50:   std::vector<T> pts;

 52: public:
 53:   PetscPointCloud(int dim, size_t num_pts, const T coords[])
 54:   {
 55:     dim              = dim > 0 ? dim : 1;
 56:     this->dimension  = dim;
 57:     this->num_points = num_pts;

 59:     pts.resize(num_pts * dim);
 60:     if (coords) {
 61:       for (size_t n = 0; n < num_pts; n++)
 62:         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
 63:     } else {
 64:       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
 65:       for (size_t n = 0; n < num_pts; n++) {
 66:         pts[n * dim] = n * h;
 67:         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
 68:       }
 69:     }
 70:   }

 72:   PetscPointCloud(const PetscPointCloud<T> &other)
 73:   {
 74:     size_t N         = other.dimension * other.num_points;
 75:     this->dimension  = other.dimension;
 76:     this->num_points = other.num_points;
 77:     this->pts.resize(N);
 78:     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
 79:   }

 81:   int getDimension() const { return dimension; }

 83:   size_t getDataSetSize() const { return num_points; }

 85:   T getDataPoint(size_t idx, int dim) const
 86:   {
 87:     assert(dim < dimension && idx < num_points);
 88:     return pts[idx * dimension + dim];
 89:   }

 91:   void Print(std::ostream &out = std::cout)
 92:   {
 93:     out << "Dimension: " << dimension << std::endl;
 94:     out << "NumPoints: " << num_points << std::endl;
 95:     for (size_t n = 0; n < num_points; n++) {
 96:       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
 97:       out << std::endl;
 98:     }
 99:   }
100: };
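
/* A minimal usage sketch (illustration only, not part of the build): coordinates are
   stored interleaved by point, with pts[n * dim + i] holding the i-th coordinate of
   point n. For example, two points in 3D would be passed as

     PetscReal coords[6] = {x0, y0, z0, x1, y1, z1}; // placeholder values
     PetscPointCloud<PetscReal> cloud(3, 2, coords);
*/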

102: template <class T>
103: class PetscFunctionGenerator {
104: private:
105:   MatH2OpusKernel k;
106:   int             dim;
107:   void           *ctx;

109: public:
110:   PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111:   {
112:     this->k   = k;
113:     this->dim = dim;
114:     this->ctx = ctx;
115:   }
116:   PetscFunctionGenerator(PetscFunctionGenerator &other)
117:   {
118:     this->k   = other.k;
119:     this->dim = other.dim;
120:     this->ctx = other.ctx;
121:   }
122:   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123: };
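
/* A sketch (illustration only, not compiled): a user callback compatible with the
   MatH2OpusKernel interface wrapped by PetscFunctionGenerator above. The functor
   forwards (dim, pt1, pt2, ctx) to the callback and casts the result to PetscScalar.
   The name GaussianKernel is hypothetical:

     static PetscScalar GaussianKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;

       (void)ctx; // unused in this sketch
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2);
     }
*/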

125: #include <../src/mat/impls/h2opus/math2opussampler.hpp>

127:   /* just to not clutter the code */
128:   #if !defined(H2OPUS_USE_GPU)
129: typedef HMatrix HMatrix_GPU;
130:     #if defined(H2OPUS_USE_MPI)
131: typedef DistributedHMatrix DistributedHMatrix_GPU;
132:     #endif
133:   #endif

135: typedef struct {
136:   #if defined(H2OPUS_USE_MPI)
137:   distributedH2OpusHandle_t handle;
138:   #else
139:   h2opusHandle_t handle;
140:   #endif
141:   /* Sequential and parallel matrices are two different classes at the moment */
142:   HMatrix *hmatrix;
143:   #if defined(H2OPUS_USE_MPI)
144:   DistributedHMatrix *dist_hmatrix;
145:   #else
146:   HMatrix       *dist_hmatrix;     /* just to not clutter the code */
147:   #endif
148:   /* May use permutations */
149:   PetscSF                           sf;
150:   PetscLayout                       h2opus_rmap, h2opus_cmap;
151:   IS                                h2opus_indexmap;
152:   thrust::host_vector<PetscScalar> *xx, *yy;
153:   PetscInt                          xxs, yys;
154:   PetscBool                         multsetup;

156:   /* GPU */
157:   HMatrix_GPU *hmatrix_gpu;
158:   #if defined(H2OPUS_USE_MPI)
159:   DistributedHMatrix_GPU *dist_hmatrix_gpu;
160:   #else
161:   HMatrix_GPU   *dist_hmatrix_gpu; /* just to not clutter the code */
162:   #endif
163:   #if defined(PETSC_H2OPUS_USE_GPU)
164:   thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165:   PetscInt                            xxs_gpu, yys_gpu;
166:   #endif

168:   /* construction from matvecs */
169:   PetscMatrixSampler *sampler;
170:   PetscBool           nativemult;

172:   /* Admissibility */
173:   PetscReal eta;
174:   PetscInt  leafsize;

176:   /* for dof reordering */
177:   PetscPointCloud<PetscReal> *ptcloud;

179:   /* kernel for generating matrix entries */
180:   PetscFunctionGenerator<PetscScalar> *kernel;

182:   /* basis orthogonalized? */
183:   PetscBool orthogonal;

185:   /* customization */
186:   PetscInt  basisord;
187:   PetscInt  max_rank;
188:   PetscInt  bs;
189:   PetscReal rtol;
190:   PetscInt  norm_max_samples;
191:   PetscBool check_construction;
192:   PetscBool hara_verbose;
193:   PetscBool resize;

195:   /* keeps track of MatScale values */
196:   PetscScalar s;
197: } Mat_H2OPUS;

199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

203:   PetscFunctionBegin;
204:   #if defined(H2OPUS_USE_MPI)
205:   h2opusDestroyDistributedHandle(a->handle);
206:   #else
207:   h2opusDestroyHandle(a->handle);
208:   #endif
209:   delete a->dist_hmatrix;
210:   delete a->hmatrix;
211:   PetscCall(PetscSFDestroy(&a->sf));
212:   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
213:   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
214:   PetscCall(ISDestroy(&a->h2opus_indexmap));
215:   delete a->xx;
216:   delete a->yy;
217:   delete a->hmatrix_gpu;
218:   delete a->dist_hmatrix_gpu;
219:   #if defined(PETSC_H2OPUS_USE_GPU)
220:   delete a->xx_gpu;
221:   delete a->yy_gpu;
222:   #endif
223:   delete a->sampler;
224:   delete a->ptcloud;
225:   delete a->kernel;
226:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
227:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
228:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
229:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
230:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
231:   PetscCall(PetscFree(A->data));
232:   PetscFunctionReturn(PETSC_SUCCESS);
233: }

235: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
236: {
237:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
238:   PetscBool   ish2opus;

240:   PetscFunctionBegin;
243:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
244:   if (ish2opus) {
245:     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246:       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247:         PetscLayout t;
248:         t              = A->rmap;
249:         A->rmap        = a->h2opus_rmap;
250:         a->h2opus_rmap = t;
251:         t              = A->cmap;
252:         A->cmap        = a->h2opus_cmap;
253:         a->h2opus_cmap = t;
254:       }
255:     }
256:     a->nativemult = nm;
257:   }
258:   PetscFunctionReturn(PETSC_SUCCESS);
259: }
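
/* Usage sketch (illustration only): with native multiplication enabled, the matrix
   exposes the internal H2Opus layouts, so callers should obtain work vectors from the
   matrix itself rather than the original application ordering; no PetscSF permutation
   is applied in MatMult() in that case.

     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
     PetscCall(MatCreateVecs(A, &x, &y)); // x, y sized for the H2Opus ordering
     PetscCall(MatMult(A, x, y));
*/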

261: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262: {
263:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
264:   PetscBool   ish2opus;

266:   PetscFunctionBegin;
268:   PetscAssertPointer(nm, 2);
269:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
270:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
271:   *nm = a->nativemult;
272:   PetscFunctionReturn(PETSC_SUCCESS);
273: }

275: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
276: {
277:   PetscBool   ish2opus;
278:   PetscInt    nmax = PETSC_DECIDE;
279:   Mat_H2OPUS *a    = NULL;
280:   PetscBool   mult = PETSC_FALSE;

282:   PetscFunctionBegin;
283:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
 284:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
285:     a = (Mat_H2OPUS *)A->data;

287:     nmax = a->norm_max_samples;
288:     mult = a->nativemult;
289:     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
290:   } else {
291:     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
292:   }
293:   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
294:   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
295:   PetscFunctionReturn(PETSC_SUCCESS);
296: }

298: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
299: {
300:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
301:   PetscInt    n;
302:   PetscBool   boundtocpu = PETSC_TRUE;

304:   PetscFunctionBegin;
305:   #if defined(PETSC_H2OPUS_USE_GPU)
306:   boundtocpu = A->boundtocpu;
307:   #endif
308:   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
309:   if (boundtocpu) {
310:     if (h2opus->xxs < xN) {
311:       h2opus->xx->resize(n * xN);
312:       h2opus->xxs = xN;
313:     }
314:     if (h2opus->yys < yN) {
315:       h2opus->yy->resize(n * yN);
316:       h2opus->yys = yN;
317:     }
318:   }
319:   #if defined(PETSC_H2OPUS_USE_GPU)
320:   if (!boundtocpu) {
321:     if (h2opus->xxs_gpu < xN) {
322:       h2opus->xx_gpu->resize(n * xN);
323:       h2opus->xxs_gpu = xN;
324:     }
325:     if (h2opus->yys_gpu < yN) {
326:       h2opus->yy_gpu->resize(n * yN);
327:       h2opus->yys_gpu = yN;
328:     }
329:   }
330:   #endif
331:   PetscFunctionReturn(PETSC_SUCCESS);
332: }

334: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
335: {
336:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
337:   #if defined(H2OPUS_USE_MPI)
338:   h2opusHandle_t handle = h2opus->handle->handle;
339:   #else
340:   h2opusHandle_t handle = h2opus->handle;
341:   #endif
342:   PetscBool    boundtocpu = PETSC_TRUE;
343:   PetscScalar *xx, *yy, *uxx, *uyy;
344:   PetscInt     blda, clda;
345:   PetscMPIInt  size;
346:   PetscSF      bsf, csf;
347:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

349:   PetscFunctionBegin;
350:   HLibProfile::clear();
351:   #if defined(PETSC_H2OPUS_USE_GPU)
352:   boundtocpu = A->boundtocpu;
353:   #endif
354:   PetscCall(MatDenseGetLDA(B, &blda));
355:   PetscCall(MatDenseGetLDA(C, &clda));
356:   if (usesf) {
357:     PetscInt n;

359:     PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
360:     PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));

362:     PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
363:     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
364:     blda = n;
365:     clda = n;
366:   }
367:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
368:   if (boundtocpu) {
369:     PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
370:     PetscCall(MatDenseGetArrayWrite(C, &yy));
371:     if (usesf) {
372:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
373:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
374:       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375:       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
376:     } else {
377:       uxx = xx;
378:       uyy = yy;
379:     }
380:     if (size > 1) {
381:       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
382:       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
383:   #if defined(H2OPUS_USE_MPI)
384:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
385:   #endif
386:     } else {
387:       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
388:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
389:     }
390:     PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
391:     if (usesf) {
392:       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393:       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
394:     }
395:     PetscCall(MatDenseRestoreArrayWrite(C, &yy));
396:   #if defined(PETSC_H2OPUS_USE_GPU)
397:   } else {
398:     PetscBool ciscuda, biscuda;

400:     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
401:     PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
402:     if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
403:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
404:     if (!ciscuda) {
405:       C->assembled = PETSC_TRUE;
406:       PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
407:     }
408:     PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
409:     PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
410:     if (usesf) {
411:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
412:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
413:       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414:       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
415:     } else {
416:       uxx = xx;
417:       uyy = yy;
418:     }
419:     PetscCall(PetscLogGpuTimeBegin());
420:     if (size > 1) {
421:       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
422:       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
423:     #if defined(H2OPUS_USE_MPI)
424:       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
425:     #endif
426:     } else {
427:       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
428:       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
429:     }
430:     PetscCall(PetscLogGpuTimeEnd());
431:     PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
432:     if (usesf) {
433:       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434:       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
435:     }
436:     PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
437:     if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
438:     if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
439:   #endif
440:   }
441:   { /* log flops */
442:     double gops, time, perf, dev;
443:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
444:   #if defined(PETSC_H2OPUS_USE_GPU)
445:     if (boundtocpu) {
446:       PetscCall(PetscLogFlops(1e9 * gops));
447:     } else {
448:       PetscCall(PetscLogGpuFlops(1e9 * gops));
449:     }
450:   #else
451:     PetscCall(PetscLogFlops(1e9 * gops));
452:   #endif
453:   }
454:   PetscFunctionReturn(PETSC_SUCCESS);
455: }

457: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
458: {
459:   Mat_Product *product = C->product;

461:   PetscFunctionBegin;
462:   MatCheckProduct(C, 1);
463:   switch (product->type) {
464:   case MATPRODUCT_AB:
465:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
466:     break;
467:   case MATPRODUCT_AtB:
468:     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
469:     break;
470:   default:
471:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
472:   }
473:   PetscFunctionReturn(PETSC_SUCCESS);
474: }

476: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
477: {
478:   Mat_Product *product = C->product;
479:   PetscBool    cisdense;
480:   Mat          A, B;

482:   PetscFunctionBegin;
483:   MatCheckProduct(C, 1);
484:   A = product->A;
485:   B = product->B;
486:   switch (product->type) {
487:   case MATPRODUCT_AB:
488:     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
489:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
490:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
491:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
492:     PetscCall(MatSetUp(C));
493:     break;
494:   case MATPRODUCT_AtB:
495:     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
496:     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
497:     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
498:     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
499:     PetscCall(MatSetUp(C));
500:     break;
501:   default:
502:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
503:   }
504:   C->ops->productsymbolic = NULL;
505:   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
506:   PetscFunctionReturn(PETSC_SUCCESS);
507: }

509: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
510: {
511:   PetscFunctionBegin;
512:   MatCheckProduct(C, 1);
513:   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
514:   PetscFunctionReturn(PETSC_SUCCESS);
515: }
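
/* Usage sketch (standard MatProduct workflow, for reference): the callbacks above
   support C = A*B and C = A^T*B with a dense B.

     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB)); // or MATPRODUCT_AtB
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/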

517: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
518: {
519:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
520:   #if defined(H2OPUS_USE_MPI)
521:   h2opusHandle_t handle = h2opus->handle->handle;
522:   #else
523:   h2opusHandle_t handle = h2opus->handle;
524:   #endif
525:   PetscBool    boundtocpu = PETSC_TRUE;
526:   PetscInt     n;
527:   PetscScalar *xx, *yy, *uxx, *uyy;
528:   PetscMPIInt  size;
529:   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

531:   PetscFunctionBegin;
532:   HLibProfile::clear();
533:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
534:   #if defined(PETSC_H2OPUS_USE_GPU)
535:   boundtocpu = A->boundtocpu;
536:   #endif
537:   if (usesf) {
538:     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
539:   } else n = A->rmap->n;
540:   if (boundtocpu) {
541:     PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
542:     if (sy == 0.0) {
543:       PetscCall(VecGetArrayWrite(y, &yy));
544:     } else {
545:       PetscCall(VecGetArray(y, &yy));
546:     }
547:     if (usesf) {
548:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
549:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

551:       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552:       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
553:       if (sy != 0.0) {
554:         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555:         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
556:       }
557:     } else {
558:       uxx = xx;
559:       uyy = yy;
560:     }
561:     if (size > 1) {
562:       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
563:       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
564:   #if defined(H2OPUS_USE_MPI)
565:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
566:   #endif
567:     } else {
568:       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
569:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
570:     }
571:     PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
572:     if (usesf) {
573:       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574:       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
575:     }
576:     if (sy == 0.0) {
577:       PetscCall(VecRestoreArrayWrite(y, &yy));
578:     } else {
579:       PetscCall(VecRestoreArray(y, &yy));
580:     }
581:   #if defined(PETSC_H2OPUS_USE_GPU)
582:   } else {
583:     PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
584:     if (sy == 0.0) {
585:       PetscCall(VecCUDAGetArrayWrite(y, &yy));
586:     } else {
587:       PetscCall(VecCUDAGetArray(y, &yy));
588:     }
589:     if (usesf) {
590:       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
591:       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

593:       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594:       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
595:       if (sy != 0.0) {
596:         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597:         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
598:       }
599:     } else {
600:       uxx = xx;
601:       uyy = yy;
602:     }
603:     PetscCall(PetscLogGpuTimeBegin());
604:     if (size > 1) {
605:       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
606:       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
607:     #if defined(H2OPUS_USE_MPI)
608:       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
609:     #endif
610:     } else {
611:       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
612:       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
613:     }
614:     PetscCall(PetscLogGpuTimeEnd());
615:     PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
616:     if (usesf) {
617:       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618:       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
619:     }
620:     if (sy == 0.0) {
621:       PetscCall(VecCUDARestoreArrayWrite(y, &yy));
622:     } else {
623:       PetscCall(VecCUDARestoreArray(y, &yy));
624:     }
625:   #endif
626:   }
627:   { /* log flops */
628:     double gops, time, perf, dev;
629:     HLibProfile::getHgemvPerf(gops, time, perf, dev);
630:   #if defined(PETSC_H2OPUS_USE_GPU)
631:     if (boundtocpu) {
632:       PetscCall(PetscLogFlops(1e9 * gops));
633:     } else {
634:       PetscCall(PetscLogGpuFlops(1e9 * gops));
635:     }
636:   #else
637:     PetscCall(PetscLogFlops(1e9 * gops));
638:   #endif
639:   }
640:   PetscFunctionReturn(PETSC_SUCCESS);
641: }

643: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
644: {
645:   PetscBool xiscuda, yiscuda;

647:   PetscFunctionBegin;
648:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
649:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
650:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
651:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
652:   PetscFunctionReturn(PETSC_SUCCESS);
653: }

655: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656: {
657:   PetscBool xiscuda, yiscuda;

659:   PetscFunctionBegin;
660:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
661:   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
662:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
663:   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
664:   PetscFunctionReturn(PETSC_SUCCESS);
665: }

667: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668: {
669:   PetscBool xiscuda, ziscuda;

671:   PetscFunctionBegin;
672:   PetscCall(VecCopy(y, z));
673:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
674:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
675:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
676:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
677:   PetscFunctionReturn(PETSC_SUCCESS);
678: }

680: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
681: {
682:   PetscBool xiscuda, ziscuda;

684:   PetscFunctionBegin;
685:   PetscCall(VecCopy(y, z));
686:   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
687:   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
688:   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
689:   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
690:   PetscFunctionReturn(PETSC_SUCCESS);
691: }

693: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
694: {
695:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

697:   PetscFunctionBegin;
698:   a->s *= s;
699:   PetscFunctionReturn(PETSC_SUCCESS);
700: }

702: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
703: {
704:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

706:   PetscFunctionBegin;
707:   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
708:   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
709:   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
710:   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
711:   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
712:   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
 713:   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be taken when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
714:   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
715:   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
716:   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
717:   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
718:   PetscOptionsHeadEnd();
719:   PetscFunctionReturn(PETSC_SUCCESS);
720: }
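
/* Example command line (illustrative values) exercising the options registered above:

     -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 -mat_h2opus_maxrank 128 -mat_h2opus_rtol 1e-6
*/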

722: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);

724: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
725: {
726:   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
727:   Vec                c;
728:   PetscInt           spacedim;
729:   const PetscScalar *coords;

731:   PetscFunctionBegin;
732:   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
733:   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
734:   if (!c && a->sampler) {
735:     Mat S = a->sampler->GetSamplingMat();

737:     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
738:   }
739:   if (!c) {
740:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
741:   } else {
742:     PetscCall(VecGetArrayRead(c, &coords));
743:     PetscCall(VecGetBlockSize(c, &spacedim));
744:     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
745:     PetscCall(VecRestoreArrayRead(c, &coords));
746:   }
747:   PetscFunctionReturn(PETSC_SUCCESS);
748: }

750: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
751: {
752:   MPI_Comm      comm;
753:   PetscMPIInt   size;
754:   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
755:   PetscInt      n = 0, *idx = NULL;
756:   int          *iidx = NULL;
757:   PetscCopyMode own;
758:   PetscBool     rid;

760:   PetscFunctionBegin;
761:   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
 762:   if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
763:     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
764:   #if defined(PETSC_H2OPUS_USE_GPU)
765:     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
766:     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
767:     a->xxs_gpu = 1;
768:     a->yys_gpu = 1;
769:   #endif
770:     a->xx  = new thrust::host_vector<PetscScalar>(n);
771:     a->yy  = new thrust::host_vector<PetscScalar>(n);
772:     a->xxs = 1;
773:     a->yys = 1;
774:   } else {
775:     IS is;
776:     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
777:     PetscCallMPI(MPI_Comm_size(comm, &size));
778:     if (!a->h2opus_indexmap) {
779:       if (size > 1) {
780:         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
781:   #if defined(H2OPUS_USE_MPI)
782:         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
783:         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
784:   #endif
785:       } else {
786:         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
787:         n    = a->hmatrix->u_basis_tree.index_map.size();
788:       }

790:       if (PetscDefined(USE_64BIT_INDICES)) {
791:         PetscInt i;

793:         own = PETSC_OWN_POINTER;
794:         PetscCall(PetscMalloc1(n, &idx));
795:         for (i = 0; i < n; i++) idx[i] = iidx[i];
796:       } else {
797:         own = PETSC_COPY_VALUES;
798:         idx = (PetscInt *)iidx;
799:       }
800:       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
801:       PetscCall(ISSetPermutation(is));
802:       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
803:       a->h2opus_indexmap = is;
804:     }
805:     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
806:     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
807:     rid = (PetscBool)(n == A->rmap->n);
808:     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
809:     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
810:     if (!rid) {
811:       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
812:         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
813:         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
814:         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
815:         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
816:       }
817:       PetscCall(PetscSFCreate(comm, &a->sf));
818:       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
819:       PetscCall(PetscSFSetUp(a->sf));
820:       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
821:   #if defined(PETSC_H2OPUS_USE_GPU)
822:       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
823:       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
824:       a->xxs_gpu = 1;
825:       a->yys_gpu = 1;
826:   #endif
827:       a->xx  = new thrust::host_vector<PetscScalar>(n);
828:       a->yy  = new thrust::host_vector<PetscScalar>(n);
829:       a->xxs = 1;
830:       a->yys = 1;
831:     }
832:     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
833:   }
834:   a->multsetup = PETSC_TRUE;
835:   PetscFunctionReturn(PETSC_SUCCESS);
836: }

838: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
839: {
840:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
841:   #if defined(H2OPUS_USE_MPI)
842:   h2opusHandle_t handle = a->handle->handle;
843:   #else
844:   h2opusHandle_t handle = a->handle;
845:   #endif
846:   PetscBool   kernel       = PETSC_FALSE;
847:   PetscBool   boundtocpu   = PETSC_TRUE;
848:   PetscBool   samplingdone = PETSC_FALSE;
849:   MPI_Comm    comm;
850:   PetscMPIInt size;

852:   PetscFunctionBegin;
853:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
854:   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
855:   PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");

857:   /* XXX */
858:   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

860:   PetscCallMPI(MPI_Comm_size(comm, &size));
861:   /* TODO REUSABILITY of geometric construction */
862:   delete a->hmatrix;
863:   delete a->dist_hmatrix;
864:   #if defined(PETSC_H2OPUS_USE_GPU)
865:   delete a->hmatrix_gpu;
866:   delete a->dist_hmatrix_gpu;
867:   #endif
868:   a->orthogonal = PETSC_FALSE;

870:   /* TODO: other? */
871:   H2OpusBoxCenterAdmissibility adm(a->eta);

873:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
874:   if (size > 1) {
875:   #if defined(H2OPUS_USE_MPI)
876:     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
877:   #else
878:     a->dist_hmatrix = NULL;
879:   #endif
880:   } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
881:   PetscCall(MatH2OpusInferCoordinates_Private(A));
882:   PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
883:   if (a->kernel) {
884:     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
885:     if (size > 1) {
886:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
887:   #if defined(H2OPUS_USE_MPI)
888:       buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
889:   #endif
890:     } else {
891:       buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
892:     }
893:     kernel = PETSC_TRUE;
894:   } else {
895:     PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
896:     buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
897:   }
898:   PetscCall(MatSetUpMultiply_H2OPUS(A));

900:   #if defined(PETSC_H2OPUS_USE_GPU)
901:   boundtocpu = A->boundtocpu;
902:   if (!boundtocpu) {
903:     if (size > 1) {
904:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
905:     #if defined(H2OPUS_USE_MPI)
906:       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
907:     #endif
908:     } else {
909:       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
910:     }
911:   }
912:   #endif
913:   if (size == 1) {
914:     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
915:       PetscReal Anorm;
916:       bool      verbose;

918:       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
919:       verbose = a->hara_verbose;
920:       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
 921:       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %" PetscInt_FMT ", tol %g (%g*%g), %s samples %" PetscInt_FMT "\n", a->max_rank, (double)(a->rtol * Anorm), (double)a->rtol, (double)Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
922:       if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
923:       a->sampler->SetStream(handle->getMainStream());
924:       if (boundtocpu) {
925:         a->sampler->SetGPUSampling(false);
926:         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
927:   #if defined(PETSC_H2OPUS_USE_GPU)
928:       } else {
929:         a->sampler->SetGPUSampling(true);
930:         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
931:   #endif
932:       }
933:       samplingdone = PETSC_TRUE;
934:     }
935:   }
936:   #if defined(PETSC_H2OPUS_USE_GPU)
937:   if (!boundtocpu) {
938:     delete a->hmatrix;
939:     delete a->dist_hmatrix;
940:     a->hmatrix      = NULL;
941:     a->dist_hmatrix = NULL;
942:   }
943:   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
944:   #endif
945:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));

947:   if (!a->s) a->s = 1.0;
948:   A->assembled = PETSC_TRUE;

950:   if (samplingdone) {
951:     PetscBool check  = a->check_construction;
952:     PetscBool checke = PETSC_FALSE;

954:     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
955:     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
956:     if (check) {
957:       Mat       E, Ae;
958:       PetscReal n1, ni, n2;
959:       PetscReal n1A, niA, n2A;
960:       void (*normfunc)(void);

962:       Ae = a->sampler->GetSamplingMat();
963:       PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
964:       PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
965:       PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
966:       PetscCall(MatNorm(E, NORM_1, &n1));
967:       PetscCall(MatNorm(E, NORM_INFINITY, &ni));
968:       PetscCall(MatNorm(E, NORM_2, &n2));
969:       if (checke) {
970:         Mat eA, eE, eAe;

972:         PetscCall(MatComputeOperator(A, MATAIJ, &eA));
973:         PetscCall(MatComputeOperator(E, MATAIJ, &eE));
974:         PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
975:         PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976:         PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
977:         PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
978:         PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
979:         PetscCall(MatView(eA, NULL));
980:         PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
981:         PetscCall(MatView(eAe, NULL));
982:         PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
983:         PetscCall(MatView(eE, NULL));
984:         PetscCall(MatDestroy(&eA));
985:         PetscCall(MatDestroy(&eE));
986:         PetscCall(MatDestroy(&eAe));
987:       }

989:       PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
990:       PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
991:       PetscCall(MatNorm(Ae, NORM_1, &n1A));
992:       PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
993:       PetscCall(MatNorm(Ae, NORM_2, &n2A));
994:       n1A = PetscMax(n1A, PETSC_SMALL);
995:       n2A = PetscMax(n2A, PETSC_SMALL);
996:       niA = PetscMax(niA, PETSC_SMALL);
997:       PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
998:       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
999:       PetscCall(MatDestroy(&E));
1000:     }
1001:     a->sampler->SetSamplingMat(NULL);
1002:   }
1003:   PetscFunctionReturn(PETSC_SUCCESS);
1004: }

1006: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007: {
1008:   PetscMPIInt size;
1009:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1011:   PetscFunctionBegin;
1012:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1013:   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1014:   a->hmatrix->clearData();
1015:   #if defined(PETSC_H2OPUS_USE_GPU)
1016:   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017:   #endif
1018:   PetscFunctionReturn(PETSC_SUCCESS);
1019: }

1021: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1022: {
1023:   Mat         A;
1024:   Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1025:   #if defined(PETSC_H2OPUS_USE_GPU)
1026:   PetscBool iscpu = PETSC_FALSE;
1027:   #else
1028:   PetscBool iscpu = PETSC_TRUE;
1029:   #endif
1030:   MPI_Comm comm;

1032:   PetscFunctionBegin;
1033:   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1034:   PetscCall(MatCreate(comm, &A));
1035:   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1036:   PetscCall(MatSetType(A, MATH2OPUS));
1037:   PetscCall(MatPropagateSymmetryOptions(B, A));
1038:   a = (Mat_H2OPUS *)A->data;

1040:   a->eta              = b->eta;
1041:   a->leafsize         = b->leafsize;
1042:   a->basisord         = b->basisord;
1043:   a->max_rank         = b->max_rank;
1044:   a->bs               = b->bs;
1045:   a->rtol             = b->rtol;
1046:   a->norm_max_samples = b->norm_max_samples;
1047:   if (op == MAT_COPY_VALUES) a->s = b->s;

1049:   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1050:   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

1052:   #if defined(H2OPUS_USE_MPI)
1053:   if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1054:     #if defined(PETSC_H2OPUS_USE_GPU)
1055:   if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1056:     #endif
1057:   #endif
1058:   if (b->hmatrix) {
1059:     a->hmatrix = new HMatrix(*b->hmatrix);
1060:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1061:   }
1062:   #if defined(PETSC_H2OPUS_USE_GPU)
1063:   if (b->hmatrix_gpu) {
1064:     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1065:     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1066:   }
1067:   #endif
1068:   if (b->sf) {
1069:     PetscCall(PetscObjectReference((PetscObject)b->sf));
1070:     a->sf = b->sf;
1071:   }
1072:   if (b->h2opus_indexmap) {
1073:     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1074:     a->h2opus_indexmap = b->h2opus_indexmap;
1075:   }

1077:   PetscCall(MatSetUp(A));
1078:   PetscCall(MatSetUpMultiply_H2OPUS(A));
1079:   if (op == MAT_COPY_VALUES) {
1080:     A->assembled  = PETSC_TRUE;
1081:     a->orthogonal = b->orthogonal;
1082:   #if defined(PETSC_H2OPUS_USE_GPU)
1083:     A->offloadmask = B->offloadmask;
1084:   #endif
1085:   }
1086:   #if defined(PETSC_H2OPUS_USE_GPU)
1087:   iscpu = B->boundtocpu;
1088:   #endif
1089:   PetscCall(MatBindToCPU(A, iscpu));

1091:   *nA = A;
1092:   PetscFunctionReturn(PETSC_SUCCESS);
1093: }

1095: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1096: {
1097:   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1098:   PetscBool         isascii, vieweps;
1099:   PetscMPIInt       size;
1100:   PetscViewerFormat format;

1102:   PetscFunctionBegin;
1103:   PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1104:   PetscCall(PetscViewerGetFormat(view, &format));
1105:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1106:   if (isascii) {
1107:     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1108:       if (size == 1) {
1109:         FILE *fp;
1110:         PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1111:         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1112:       }
1113:     } else {
1114:       PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1115:       PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1116:       PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1117:       if (!h2opus->kernel) {
1118:         PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1119:       } else {
1120:         PetscCall(PetscViewerASCIIPrintf(view, "  Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1121:       }
1122:       PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1123:       if (size == 1) {
1124:         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1125:         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1126:   #if defined(PETSC_HAVE_CUDA)
1127:         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1128:         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1129:   #endif
1130:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1131:   #if defined(PETSC_HAVE_CUDA)
1132:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1133:   #endif
1134:       } else {
1135:   #if defined(PETSC_HAVE_CUDA)
1136:         double      matrix_mem[4] = {0., 0., 0., 0.};
1137:         PetscMPIInt rsize         = 4;
1138:   #else
1139:         double      matrix_mem[2] = {0., 0.};
1140:         PetscMPIInt rsize         = 2;
1141:   #endif
1142:   #if defined(H2OPUS_USE_MPI)
1143:         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1144:         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1145:     #if defined(PETSC_HAVE_CUDA)
1146:         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1147:         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1148:     #endif
1149:   #endif
1150:         PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1151:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1152:   #if defined(PETSC_HAVE_CUDA)
1153:         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1154:   #endif
1155:       }
1156:     }
1157:   }
1158:   vieweps = PETSC_FALSE;
1159:   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1160:   if (vieweps) {
1161:     char        filename[256];
1162:     const char *name;

1164:     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1165:     PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1166:     PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1167:     outputEps(*h2opus->hmatrix, filename);
1168:   }
1169:   PetscFunctionReturn(PETSC_SUCCESS);
1170: }

1172: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1173: {
1174:   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1175:   PetscReal  *gcoords;
1176:   PetscInt    N;
1177:   MPI_Comm    comm;
1178:   PetscMPIInt size;
1179:   PetscBool   cong;

1181:   PetscFunctionBegin;
1182:   PetscCall(PetscLayoutSetUp(A->rmap));
1183:   PetscCall(PetscLayoutSetUp(A->cmap));
1184:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1185:   PetscCall(MatHasCongruentLayouts(A, &cong));
1186:   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1187:   N = A->rmap->N;
1188:   PetscCallMPI(MPI_Comm_size(comm, &size));
1189:   if (spacedim > 0 && size > 1 && cdist) {
1190:     PetscSF      sf;
1191:     MPI_Datatype dtype;

1193:     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1194:     PetscCallMPI(MPI_Type_commit(&dtype));

1196:     PetscCall(PetscSFCreate(comm, &sf));
1197:     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1198:     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1199:     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1200:     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1201:     PetscCall(PetscSFDestroy(&sf));
1202:     PetscCallMPI(MPI_Type_free(&dtype));
1203:   } else gcoords = (PetscReal *)coords;

1205:   delete h2opus->ptcloud;
1206:   delete h2opus->kernel;
1207:   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1208:   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1209:   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1210:   A->preallocated = PETSC_TRUE;
1211:   PetscFunctionReturn(PETSC_SUCCESS);
1212: }

1214:   #if defined(PETSC_H2OPUS_USE_GPU)
1215: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1216: {
1217:   PetscMPIInt size;
1218:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1220:   PetscFunctionBegin;
1221:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1222:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1223:     if (size > 1) {
1224:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225:     #if defined(H2OPUS_USE_MPI)
1226:       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1227:       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1228:     #endif
1229:     } else {
1230:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1231:       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1232:       else *a->hmatrix = *a->hmatrix_gpu;
1233:     }
1234:     delete a->hmatrix_gpu;
1235:     delete a->dist_hmatrix_gpu;
1236:     a->hmatrix_gpu      = NULL;
1237:     a->dist_hmatrix_gpu = NULL;
1238:     A->offloadmask      = PETSC_OFFLOAD_CPU;
1239:   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1240:     if (size > 1) {
1241:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242:     #if defined(H2OPUS_USE_MPI)
1243:       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1244:       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1245:     #endif
1246:     } else {
1247:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1248:       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1249:       else *a->hmatrix_gpu = *a->hmatrix;
1250:     }
1251:     delete a->hmatrix;
1252:     delete a->dist_hmatrix;
1253:     a->hmatrix      = NULL;
1254:     a->dist_hmatrix = NULL;
1255:     A->offloadmask  = PETSC_OFFLOAD_GPU;
1256:   }
1257:   PetscCall(PetscFree(A->defaultvectype));
1258:   if (!flg) {
1259:     PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1260:   } else {
1261:     PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1262:   }
1263:   A->boundtocpu = flg;
1264:   PetscFunctionReturn(PETSC_SUCCESS);
1265: }
1266:   #endif

1268: /*MC
1269:    MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.

1271:    Options Database Key:
1272: .  -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()

1274:    Level: beginner

1276:    Notes:
1277:    H2Opus implements hierarchical matrices in the $H^2$ flavour and runs on CPUs and NVIDIA GPUs.

1279:    For CPU-only builds, configure PETSc with `./configure --download-h2opus --download-thrust`.
1280:    To run on NVIDIA GPUs, configure with `./configure --download-h2opus --download-magma --download-kblas`.
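
   Example usage (a minimal sketch; it assumes the `MatCreateH2OpusFromKernel()` calling sequence documented in its manual page, with `kernel` a user-provided `MatH2OpusKernel` callback and `coords` the point coordinates):
.vb
   MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, N, N, spacedim, coords, PETSC_FALSE, kernel, NULL, eta, leafsize, basisord, &A);
   MatSetFromOptions(A);
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
.ve
   Construction can be tuned at run time with -mat_h2opus_leafsize, -mat_h2opus_eta, -mat_h2opus_maxrank and -mat_h2opus_rtol, among other options.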

1282: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1283: M*/
1284: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1285: {
1286:   Mat_H2OPUS *a;
1287:   PetscMPIInt size;

1289:   PetscFunctionBegin;
1290:   #if defined(PETSC_H2OPUS_USE_GPU)
1291:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1292:   #endif
1293:   PetscCall(PetscNew(&a));
1294:   A->data = (void *)a;

1296:   a->eta              = 0.9;
1297:   a->leafsize         = 32;
1298:   a->basisord         = 4;
1299:   a->max_rank         = 64;
1300:   a->bs               = 32;
1301:   a->rtol             = 1.e-4;
1302:   a->s                = 1.0;
1303:   a->norm_max_samples = 10;
1304:   a->resize           = PETSC_TRUE; /* reallocate after compression */
1305:   #if defined(H2OPUS_USE_MPI)
1306:   h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1307:   #else
1308:   h2opusCreateHandle(&a->handle);
1309:   #endif
1310:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1311:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1312:   PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

1314:   A->ops->destroy          = MatDestroy_H2OPUS;
1315:   A->ops->view             = MatView_H2OPUS;
1316:   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1317:   A->ops->mult             = MatMult_H2OPUS;
1318:   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1319:   A->ops->multadd          = MatMultAdd_H2OPUS;
1320:   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1321:   A->ops->scale            = MatScale_H2OPUS;
1322:   A->ops->duplicate        = MatDuplicate_H2OPUS;
1323:   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1324:   A->ops->norm             = MatNorm_H2OPUS;
1325:   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1326:   #if defined(PETSC_H2OPUS_USE_GPU)
1327:   A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1328:   #endif

1330:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1331:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1332:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1333:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1334:   #if defined(PETSC_H2OPUS_USE_GPU)
1335:   PetscCall(PetscFree(A->defaultvectype));
1336:   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1337:   #endif
1338:   PetscFunctionReturn(PETSC_SUCCESS);
1339: }
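/*
   A minimal sketch (not part of this source file) showing how the defaults set in MatCreate_H2OPUS()
   can be overridden from the options database; `A` is assumed to be a MATH2OPUS matrix created with
   one of the constructors documented below.

     // e.g. run with: -mat_h2opus_eta 0.7 -mat_h2opus_leafsize 64 -mat_h2opus_order 6
     PetscCall(MatSetFromOptions(A));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/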

1341: /*@C
1342:   MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.

1344:   Input Parameter:
1345: . A - the matrix

1347:   Level: intermediate

1349: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1350: @*/
1351: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1352: {
1353:   PetscBool   ish2opus;
1354:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1355:   PetscMPIInt size;
1356:   PetscBool   boundtocpu = PETSC_TRUE;

1358:   PetscFunctionBegin;
1361:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1362:   if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1363:   if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1364:   HLibProfile::clear();
1365:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1366:   #if defined(PETSC_H2OPUS_USE_GPU)
1367:   boundtocpu = A->boundtocpu;
1368:   #endif
1369:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1370:   if (size > 1) {
1371:     if (boundtocpu) {
1372:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1373:   #if defined(H2OPUS_USE_MPI)
1374:       distributed_horthog(*a->dist_hmatrix, a->handle);
1375:   #endif
1376:   #if defined(PETSC_H2OPUS_USE_GPU)
1377:       A->offloadmask = PETSC_OFFLOAD_CPU;
1378:     } else {
1379:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1380:       PetscCall(PetscLogGpuTimeBegin());
1381:     #if defined(H2OPUS_USE_MPI)
1382:       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1383:     #endif
1384:       PetscCall(PetscLogGpuTimeEnd());
1385:   #endif
1386:     }
1387:   } else {
1388:   #if defined(H2OPUS_USE_MPI)
1389:     h2opusHandle_t handle = a->handle->handle;
1390:   #else
1391:     h2opusHandle_t handle = a->handle;
1392:   #endif
1393:     if (boundtocpu) {
1394:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1395:       horthog(*a->hmatrix, handle);
1396:   #if defined(PETSC_H2OPUS_USE_GPU)
1397:       A->offloadmask = PETSC_OFFLOAD_CPU;
1398:     } else {
1399:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1400:       PetscCall(PetscLogGpuTimeBegin());
1401:       horthog(*a->hmatrix_gpu, handle);
1402:       PetscCall(PetscLogGpuTimeEnd());
1403:   #endif
1404:     }
1405:   }
1406:   a->orthogonal = PETSC_TRUE;
1407:   { /* log flops */
1408:     double gops, time, perf, dev;
1409:     HLibProfile::getHorthogPerf(gops, time, perf, dev);
1410:   #if defined(PETSC_H2OPUS_USE_GPU)
1411:     if (boundtocpu) {
1412:       PetscCall(PetscLogFlops(1e9 * gops));
1413:     } else {
1414:       PetscCall(PetscLogGpuFlops(1e9 * gops));
1415:     }
1416:   #else
1417:     PetscCall(PetscLogFlops(1e9 * gops));
1418:   #endif
1419:   }
1420:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1421:   PetscFunctionReturn(PETSC_SUCCESS);
1422: }
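/*
   A minimal sketch (not part of this source file): the basis tree is typically re-orthogonalized after an
   operation that invalidates it, such as a low-rank update; the MATH2OPUS matrix `A` and the dense matrix
   `U` are assumed to exist.

     PetscCall(MatH2OpusLowRankUpdate(A, U, NULL, 1.0)); // marks the basis as non-orthogonal
     PetscCall(MatH2OpusOrthogonalize(A));               // no-op if the basis tree is already orthogonal
*/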

1424: /*@C
1425:   MatH2OpusCompress - Compress a hierarchical matrix.

1427:   Input Parameters:
1428: + A   - the matrix
1429: - tol - the absolute truncation threshold

1431:   Level: intermediate

1433: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1434: @*/
1435: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1436: {
1437:   PetscBool   ish2opus;
1438:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1439:   PetscMPIInt size;
1440:   PetscBool   boundtocpu = PETSC_TRUE;

1442:   PetscFunctionBegin;
1446:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1447:   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1448:   PetscCall(MatH2OpusOrthogonalize(A));
1449:   HLibProfile::clear();
1450:   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1451:   #if defined(PETSC_H2OPUS_USE_GPU)
1452:   boundtocpu = A->boundtocpu;
1453:   #endif
1454:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1455:   if (size > 1) {
1456:     if (boundtocpu) {
1457:       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1458:   #if defined(H2OPUS_USE_MPI)
1459:       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1460:       if (a->resize) {
1461:         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1462:         delete a->dist_hmatrix;
1463:         a->dist_hmatrix = dist_hmatrix;
1464:       }
1465:   #endif
1466:   #if defined(PETSC_H2OPUS_USE_GPU)
1467:       A->offloadmask = PETSC_OFFLOAD_CPU;
1468:     } else {
1469:       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1470:       PetscCall(PetscLogGpuTimeBegin());
1471:     #if defined(H2OPUS_USE_MPI)
1472:       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

1474:       if (a->resize) {
1475:         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1476:         delete a->dist_hmatrix_gpu;
1477:         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1478:       }
1479:     #endif
1480:       PetscCall(PetscLogGpuTimeEnd());
1481:   #endif
1482:     }
1483:   } else {
1484:   #if defined(H2OPUS_USE_MPI)
1485:     h2opusHandle_t handle = a->handle->handle;
1486:   #else
1487:     h2opusHandle_t handle = a->handle;
1488:   #endif
1489:     if (boundtocpu) {
1490:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1491:       hcompress(*a->hmatrix, tol, handle);

1493:       if (a->resize) {
1494:         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1495:         delete a->hmatrix;
1496:         a->hmatrix = hmatrix;
1497:       }
1498:   #if defined(PETSC_H2OPUS_USE_GPU)
1499:       A->offloadmask = PETSC_OFFLOAD_CPU;
1500:     } else {
1501:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1502:       PetscCall(PetscLogGpuTimeBegin());
1503:       hcompress(*a->hmatrix_gpu, tol, handle);
1504:       PetscCall(PetscLogGpuTimeEnd());

1506:       if (a->resize) {
1507:         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1508:         delete a->hmatrix_gpu;
1509:         a->hmatrix_gpu = hmatrix_gpu;
1510:       }
1511:   #endif
1512:     }
1513:   }
1514:   { /* log flops */
1515:     double gops, time, perf, dev;
1516:     HLibProfile::getHcompressPerf(gops, time, perf, dev);
1517:   #if defined(PETSC_H2OPUS_USE_GPU)
1518:     if (boundtocpu) {
1519:       PetscCall(PetscLogFlops(1e9 * gops));
1520:     } else {
1521:       PetscCall(PetscLogGpuFlops(1e9 * gops));
1522:     }
1523:   #else
1524:     PetscCall(PetscLogFlops(1e9 * gops));
1525:   #endif
1526:   }
1527:   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1528:   PetscFunctionReturn(PETSC_SUCCESS);
1529: }
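/*
   A minimal sketch (not part of this source file) of compressing an assembled MATH2OPUS matrix `A` with
   an absolute truncation threshold; the basis is orthogonalized first by the routine itself.

     PetscCall(MatH2OpusCompress(A, 1.e-6));
*/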

1531: /*@C
1532:   MatH2OpusSetSamplingMat - Set the matrix whose matrix-vector products are sampled in order to construct the hierarchical matrix.

1534:   Input Parameters:
1535: + A   - the hierarchical matrix
1536: . B   - the matrix to be sampled
1537: . bs  - maximum number of samples to be taken concurrently
1538: - tol - relative tolerance for construction

1540:   Level: intermediate

1542:   Note:
1543:   You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.

1545: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1546: @*/
1547: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1548: {
1549:   PetscBool ish2opus;

1551:   PetscFunctionBegin;
1557:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1558:   if (ish2opus) {
1559:     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1561:     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1562:     a->sampler->SetSamplingMat(B);
1563:     if (bs > 0) a->bs = bs;
1564:     if (tol > 0.) a->rtol = tol;
1565:     delete a->kernel;
1566:   }
1567:   PetscFunctionReturn(PETSC_SUCCESS);
1568: }
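/*
   A minimal sketch (not part of this source file): switch the operator being sampled and rebuild the
   hierarchical approximation; `A` is assumed to be a MATH2OPUS matrix and `B` a compatible operator.

     PetscCall(MatH2OpusSetSamplingMat(A, B, 32, 1.e-4)); // 32 samples at a time, relative tolerance 1e-4
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));  // reconstructs the hierarchical matrix from B
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/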

1570: /*@C
1571:   MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.

1573:   Input Parameters:
1574: + comm      - MPI communicator
1575: . m         - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1576: . n         - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1577: . M         - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1578: . N         - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1579: . spacedim  - dimension of the space coordinates
1580: . coords    - coordinates of the points
1581: . cdist     - whether or not coordinates are distributed
1582: . kernel    - computational kernel (or `NULL`)
1583: . kernelctx - kernel context
1584: . eta       - admissibility condition tolerance
1585: . leafsize  - leaf size in cluster tree
1586: - basisord  - approximation order for Chebychev interpolation of low-rank blocks

1588:   Output Parameter:
1589: . nA - matrix

1591:   Options Database Keys:
1592: + -mat_h2opus_leafsize <`PetscInt`>    - Leaf size of cluster tree
1593: . -mat_h2opus_eta <`PetscReal`>        - Admissibility condition tolerance
1594: . -mat_h2opus_order <`PetscInt`>       - Chebychev approximation order
1595: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms

1597:   Level: intermediate

1599: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1600: @*/
1601: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1602: {
1603:   Mat         A;
1604:   Mat_H2OPUS *h2opus;
1605:   #if defined(PETSC_H2OPUS_USE_GPU)
1606:   PetscBool iscpu = PETSC_FALSE;
1607:   #else
1608:   PetscBool iscpu = PETSC_TRUE;
1609:   #endif

1611:   PetscFunctionBegin;
1612:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1613:   PetscCall(MatCreate(comm, &A));
1614:   PetscCall(MatSetSizes(A, m, n, M, N));
1615:   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1616:   PetscCall(MatSetType(A, MATH2OPUS));
1617:   PetscCall(MatBindToCPU(A, iscpu));
1618:   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));

1620:   h2opus = (Mat_H2OPUS *)A->data;
1621:   if (eta > 0.) h2opus->eta = eta;
1622:   if (leafsize > 0) h2opus->leafsize = leafsize;
1623:   if (basisord > 0) h2opus->basisord = basisord;

1625:   *nA = A;
1626:   PetscFunctionReturn(PETSC_SUCCESS);
1627: }
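/*
   A minimal sketch (not part of this source file) of building a MATH2OPUS matrix from a kernel; the kernel
   name, the point count `n`, and the coordinate array `coords` are illustrative, and each process is assumed
   to pass the coordinates of its own points (cdist = PETSC_TRUE). The eta/leafsize/basisord values shown
   coincide with the defaults set in MatCreate_H2OPUS().

     static PetscScalar GaussianKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2);
     }

     // n local points with 2D coordinates stored in coords[2 * n]
     Mat A;
     PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, PETSC_DETERMINE, PETSC_DETERMINE, 2, coords, PETSC_TRUE, GaussianKernel, NULL, 0.9, 32, 4, &A));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/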

1629: /*@C
1630:   MatCreateH2OpusFromMat - Creates a `MATH2OPUS` matrix by sampling a user-supplied operator.

1632:   Input Parameters:
1633: + B        - the matrix to be sampled
1634: . spacedim - dimension of the space coordinates
1635: . coords   - coordinates of the points
1636: . cdist    - whether or not coordinates are distributed
1637: . eta      - admissibility condition tolerance
1638: . leafsize - leaf size in cluster tree
1639: . maxrank  - maximum rank allowed
1640: . bs       - maximum number of samples to be taken concurrently
1641: - rtol     - relative tolerance for construction

1643:   Output Parameter:
1644: . nA - matrix

1646:   Options Database Keys:
1647: + -mat_h2opus_leafsize <`PetscInt`>      - Leaf size of cluster tree
1648: . -mat_h2opus_eta <`PetscReal`>          - Admissibility condition tolerance
1649: . -mat_h2opus_maxrank <`PetscInt`>       - Maximum rank when constructed from matvecs
1650: . -mat_h2opus_samples <`PetscInt`>       - Maximum number of samples to be taken concurrently when constructing from matvecs
1651: . -mat_h2opus_rtol <`PetscReal`>         - Relative tolerance for construction from sampling
1652: . -mat_h2opus_check <`PetscBool`>        - Check error when constructing from sampling during `MatAssemblyEnd()`
1653: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1654: - -mat_h2opus_normsamples <`PetscInt`>   - Maximum number of samples to be used when estimating norms

1656:   Level: intermediate

1658:   Note:
1659:   Not available in parallel

1661: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1662: @*/
1663: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1664: {
1665:   Mat         A;
1666:   Mat_H2OPUS *h2opus;
1667:   MPI_Comm    comm;
1668:   PetscBool   boundtocpu = PETSC_TRUE;

1670:   PetscFunctionBegin;
1679:   PetscAssertPointer(nA, 10);
1680:   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1681:   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1682:   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1683:   PetscCall(MatCreate(comm, &A));
1684:   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1685:   #if defined(PETSC_H2OPUS_USE_GPU)
1686:   {
1687:     VecType   vtype;
1688:     PetscBool isstd, iscuda, iskok;

1690:     PetscCall(MatGetVecType(B, &vtype));
1691:     PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1692:     PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1693:     PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1694:     PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1695:     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1696:     if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1697:   }
1698:   #endif
1699:   PetscCall(MatSetType(A, MATH2OPUS));
1700:   PetscCall(MatBindToCPU(A, boundtocpu));
1701:   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1702:   PetscCall(MatPropagateSymmetryOptions(B, A));
1703:   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

1705:   h2opus          = (Mat_H2OPUS *)A->data;
1706:   h2opus->sampler = new PetscMatrixSampler(B);
1707:   if (eta > 0.) h2opus->eta = eta;
1708:   if (leafsize > 0) h2opus->leafsize = leafsize;
1709:   if (maxrank > 0) h2opus->max_rank = maxrank;
1710:   if (bs > 0) h2opus->bs = bs;
1711:   if (rtol > 0.) h2opus->rtol = rtol;
1712:   *nA             = A;
1713:   A->preallocated = PETSC_TRUE;
1714:   PetscFunctionReturn(PETSC_SUCCESS);
1715: }
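/*
   A minimal sketch (not part of this source file) of approximating an existing operator; `B` is assumed to
   be a square sequential matrix whose n points have 3D coordinates stored in coords[3 * n] (the routine is
   documented above as not available in parallel). The numeric arguments coincide with the defaults set in
   MatCreate_H2OPUS().

     Mat A;
     PetscCall(MatCreateH2OpusFromMat(B, 3, coords, PETSC_FALSE, 0.9, 32, 64, 32, 1.e-4, &A));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); // triggers the construction by sampling matrix-vector products with B
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/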

1717: /*@C
1718:   MatH2OpusGetIndexMap - Access the reordering index set.

1720:   Input Parameter:
1721: . A - the matrix

1723:   Output Parameter:
1724: . indexmap - the index set for the reordering

1726:   Level: intermediate

1728: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1729: @*/
1730: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1731: {
1732:   PetscBool   ish2opus;
1733:   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;

1735:   PetscFunctionBegin;
1738:   PetscAssertPointer(indexmap, 2);
1739:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1740:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1741:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1742:   *indexmap = a->h2opus_indexmap;
1743:   PetscFunctionReturn(PETSC_SUCCESS);
1744: }
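/*
   A minimal sketch (not part of this source file) of inspecting the reordering; the returned index set is
   the one stored in the matrix, so it is assumed the caller must not destroy it.

     IS indexmap;

     PetscCall(MatH2OpusGetIndexMap(A, &indexmap));
     PetscCall(ISView(indexmap, PETSC_VIEWER_STDOUT_WORLD));
*/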

1746: /*@C
1747:   MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering

1749:   Input Parameters:
1750: + A             - the matrix
1751: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1752: - in            - the vector to be mapped

1754:   Output Parameter:
1755: . out - the newly created mapped vector

1757:   Level: intermediate

1759: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1760: @*/
1761: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1762: {
1763:   PetscBool    ish2opus;
1764:   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1765:   PetscScalar *xin, *xout;
1766:   PetscBool    nm;

1768:   PetscFunctionBegin;
1773:   PetscAssertPointer(out, 4);
1774:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1775:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1776:   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1777:   nm = a->nativemult;
1778:   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1779:   PetscCall(MatCreateVecs(A, out, NULL));
1780:   PetscCall(MatH2OpusSetNativeMult(A, nm));
1781:   if (!a->sf) { /* same ordering */
1782:     PetscCall(VecCopy(in, *out));
1783:     PetscFunctionReturn(PETSC_SUCCESS);
1784:   }
1785:   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1786:   PetscCall(VecGetArrayWrite(*out, &xout));
1787:   if (nativetopetsc) {
1788:     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1789:     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1790:   } else {
1791:     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792:     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1793:   }
1794:   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1795:   PetscCall(VecRestoreArrayWrite(*out, &xout));
1796:   PetscFunctionReturn(PETSC_SUCCESS);
1797: }
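/*
   A minimal sketch (not part of this source file) of mapping a vector from PETSc ordering to H2Opus
   ordering; `A` is assumed to be an assembled MATH2OPUS matrix. The output vector is created by the call.

     Vec xp, xh;

     PetscCall(MatCreateVecs(A, &xp, NULL));
     PetscCall(VecSet(xp, 1.0));
     PetscCall(MatH2OpusMapVec(A, PETSC_FALSE, xp, &xh)); // nativetopetsc = PETSC_FALSE: PETSc -> H2Opus ordering
     PetscCall(VecDestroy(&xh));
     PetscCall(VecDestroy(&xp));
*/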

1799: /*@C
1800:   MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $

1802:   Input Parameters:
1803: + A - the hierarchical `MATH2OPUS` matrix
1804: . s - the scaling factor
1805: . U - the dense low-rank update matrix
1806: - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)

1808:   Note:
1809:   The `U` and `V` matrices must be in `MATDENSE` format

1811:   Level: intermediate

1813: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1814: @*/
1815: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1816: {
1817:   PetscBool flg;

1819:   PetscFunctionBegin;
1822:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1824:   PetscCheckSameComm(A, 1, U, 2);
1825:   if (V) {
1827:     PetscCheckSameComm(A, 1, V, 3);
1828:   }

1831:   if (!V) V = U;
1832:   PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1833:   if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1834:   PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1835:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1836:   PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1837:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row layout");
1838:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1839:   if (flg) {
1840:     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1841:     const PetscScalar *u, *v, *uu, *vv;
1842:     PetscInt           ldu, ldv;
1843:     PetscMPIInt        size;
1844:   #if defined(H2OPUS_USE_MPI)
1845:     h2opusHandle_t handle = a->handle->handle;
1846:   #else
1847:     h2opusHandle_t handle = a->handle;
1848:   #endif
1849:     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1850:     PetscSF   usf, vsf;

1852:     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1853:     PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1854:     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1855:     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1856:     PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1857:     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1858:     PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1859:     PetscCall(MatDenseGetLDA(U, &ldu));
1860:     PetscCall(MatDenseGetLDA(V, &ldv));
1861:     PetscCall(MatBoundToCPU(A, &flg));
1862:     if (usesf) {
1863:       PetscInt n;

1865:       PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
1866:       PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
1867:       PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1868:       PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1869:       ldu = n;
1870:       ldv = n;
1871:     }
1872:     if (flg) {
1873:       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1874:       PetscCall(MatDenseGetArrayRead(U, &u));
1875:       PetscCall(MatDenseGetArrayRead(V, &v));
1876:       if (usesf) {
1877:         vv = MatH2OpusGetThrustPointer(*a->yy);
1878:         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1879:         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1880:         if (U != V) {
1881:           uu = MatH2OpusGetThrustPointer(*a->xx);
1882:           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1883:           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1884:         } else uu = vv;
1885:       } else {
1886:         uu = u;
1887:         vv = v;
1888:       }
1889:       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1890:       PetscCall(MatDenseRestoreArrayRead(U, &u));
1891:       PetscCall(MatDenseRestoreArrayRead(V, &v));
1892:     } else {
1893:   #if defined(PETSC_H2OPUS_USE_GPU)
1894:       PetscBool flgU, flgV;

1896:       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1897:       PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1898:       if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1899:       PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1900:       if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1901:       PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1902:       PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1903:       if (usesf) {
1904:         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1905:         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1906:         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1907:         if (U != V) {
1908:           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1909:           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1910:           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1911:         } else uu = vv;
1912:       } else {
1913:         uu = u;
1914:         vv = v;
1915:       }
1916:   #else
1917:       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1918:   #endif
1919:       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1920:   #if defined(PETSC_H2OPUS_USE_GPU)
1921:       PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1922:       PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1923:       if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1924:       if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1925:   #endif
1926:     }
1927:     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1928:     a->orthogonal = PETSC_FALSE;
1929:   }
1930:   PetscFunctionReturn(PETSC_SUCCESS);
1931: }
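/*
   A minimal sketch (not part of this source file) of a symmetric rank-2 update A <- A + U * U^T; the
   MATH2OPUS matrix `A` is assumed to be assembled and, per the check above, to live on a single process.

     Mat      U;
     MPI_Comm comm;
     PetscInt m, M, k = 2;

     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
     PetscCall(MatGetLocalSize(A, &m, NULL));
     PetscCall(MatGetSize(A, &M, NULL));
     PetscCall(MatCreateDense(comm, m, PETSC_DECIDE, M, k, NULL, &U));
     PetscCall(MatSetRandom(U, NULL));
     PetscCall(MatH2OpusLowRankUpdate(A, U, NULL, 1.0)); // V = NULL means V = U
     PetscCall(MatDestroy(&U));
*/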
1932: #endif