Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petsc/private/deviceimpl.h>
16: #include <petscsf.h>
18: /* math2opusutils */
19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
21: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
25: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
27: /* Use GPU only if H2OPUS is configured for GPU */
28: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29: #define PETSC_H2OPUS_USE_GPU
30: #endif
31: #if defined(PETSC_H2OPUS_USE_GPU)
32: #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
33: #else
34: #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
35: #endif
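/* MatH2OpusUpdateIfNeeded(A,B): in GPU-enabled builds this calls MatBindToCPU() with the
   appropriate flag so that the host or device copy of the hierarchical matrix is up to date
   before the operation (B = PETSC_TRUE forces the CPU path); in CPU-only builds it expands to
   a successful no-op. */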
37: // TODO H2OPUS:
38: // DistributedHMatrix
39: // unsymmetric ?
40: // transpose for distributed_hgemv?
41: // clearData()
42: // Unify interface for sequential and parallel?
43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44: //
45: template <class T>
46: class PetscPointCloud : public H2OpusDataSet<T> {
47: private:
48: int dimension;
49: size_t num_points;
50: std::vector<T> pts;
52: public:
53: PetscPointCloud(int dim, size_t num_pts, const T coords[])
54: {
55: dim = dim > 0 ? dim : 1;
56: this->dimension = dim;
57: this->num_points = num_pts;
59: pts.resize(num_pts * dim);
60: if (coords) {
61: for (size_t n = 0; n < num_pts; n++)
62: for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
63: } else {
64: PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
65: for (size_t n = 0; n < num_pts; n++) {
66: pts[n * dim] = n * h;
67: for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
68: }
69: }
70: }
72: PetscPointCloud(const PetscPointCloud<T> &other)
73: {
74: size_t N = other.dimension * other.num_points;
75: this->dimension = other.dimension;
76: this->num_points = other.num_points;
77: this->pts.resize(N);
78: for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
79: }
81: int getDimension() const { return dimension; }
83: size_t getDataSetSize() const { return num_points; }
85: T getDataPoint(size_t idx, int dim) const
86: {
87: assert(dim < dimension && idx < num_points);
88: return pts[idx * dimension + dim];
89: }
91: void Print(std::ostream &out = std::cout)
92: {
93: out << "Dimension: " << dimension << std::endl;
94: out << "NumPoints: " << num_points << std::endl;
95: for (size_t n = 0; n < num_points; n++) {
96: for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
97: out << std::endl;
98: }
99: }
100: };
102: template <class T>
103: class PetscFunctionGenerator {
104: private:
105: MatH2OpusKernel k;
106: int dim;
107: void *ctx;
109: public:
110: PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111: {
112: this->k = k;
113: this->dim = dim;
114: this->ctx = ctx;
115: }
116:   PetscFunctionGenerator(const PetscFunctionGenerator &other)
117: {
118: this->k = other.k;
119: this->dim = other.dim;
120: this->ctx = other.ctx;
121: }
122: T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123: };
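/* PetscFunctionGenerator wraps a user MatH2OpusKernel callback of the form
     PetscScalar kernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
   A minimal sketch of such a callback (the name and the unit decay length are only illustrative):

     static PetscScalar ExpKernelExample(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal d2 = 0.0;
       for (PetscInt i = 0; i < sdim; i++) d2 += (x[i] - y[i]) * (x[i] - y[i]);
       return PetscExpReal(-PetscSqrtReal(d2)); // exp(-|x - y|)
     }
*/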
125: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
127: /* just to not clutter the code */
128: #if !defined(H2OPUS_USE_GPU)
129: typedef HMatrix HMatrix_GPU;
130: #if defined(H2OPUS_USE_MPI)
131: typedef DistributedHMatrix DistributedHMatrix_GPU;
132: #endif
133: #endif
135: typedef struct {
136: #if defined(H2OPUS_USE_MPI)
137: distributedH2OpusHandle_t handle;
138: #else
139: h2opusHandle_t handle;
140: #endif
141: /* Sequential and parallel matrices are two different classes at the moment */
142: HMatrix *hmatrix;
143: #if defined(H2OPUS_USE_MPI)
144: DistributedHMatrix *dist_hmatrix;
145: #else
146: HMatrix *dist_hmatrix; /* just to not clutter the code */
147: #endif
148: /* May use permutations */
149: PetscSF sf;
150: PetscLayout h2opus_rmap, h2opus_cmap;
151: IS h2opus_indexmap;
152: thrust::host_vector<PetscScalar> *xx, *yy;
153: PetscInt xxs, yys;
154: PetscBool multsetup;
156: /* GPU */
157: HMatrix_GPU *hmatrix_gpu;
158: #if defined(H2OPUS_USE_MPI)
159: DistributedHMatrix_GPU *dist_hmatrix_gpu;
160: #else
161: HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
162: #endif
163: #if defined(PETSC_H2OPUS_USE_GPU)
164: thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165: PetscInt xxs_gpu, yys_gpu;
166: #endif
168: /* construction from matvecs */
169: PetscMatrixSampler *sampler;
170: PetscBool nativemult;
172: /* Admissibility */
173: PetscReal eta;
174: PetscInt leafsize;
176: /* for dof reordering */
177: PetscPointCloud<PetscReal> *ptcloud;
179: /* kernel for generating matrix entries */
180: PetscFunctionGenerator<PetscScalar> *kernel;
182: /* basis orthogonalized? */
183: PetscBool orthogonal;
185: /* customization */
186: PetscInt basisord;
187: PetscInt max_rank;
188: PetscInt bs;
189: PetscReal rtol;
190: PetscInt norm_max_samples;
191: PetscBool check_construction;
192: PetscBool hara_verbose;
193: PetscBool resize;
195: /* keeps track of MatScale values */
196: PetscScalar s;
197: } Mat_H2OPUS;
199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
203: PetscFunctionBegin;
204: #if defined(H2OPUS_USE_MPI)
205: h2opusDestroyDistributedHandle(a->handle);
206: #else
207: h2opusDestroyHandle(a->handle);
208: #endif
209: delete a->dist_hmatrix;
210: delete a->hmatrix;
211: PetscCall(PetscSFDestroy(&a->sf));
212: PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
213: PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
214: PetscCall(ISDestroy(&a->h2opus_indexmap));
215: delete a->xx;
216: delete a->yy;
217: delete a->hmatrix_gpu;
218: delete a->dist_hmatrix_gpu;
219: #if defined(PETSC_H2OPUS_USE_GPU)
220: delete a->xx_gpu;
221: delete a->yy_gpu;
222: #endif
223: delete a->sampler;
224: delete a->ptcloud;
225: delete a->kernel;
226: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
227: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
228: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
229: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
230: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
231: PetscCall(PetscFree(A->data));
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
236: {
237: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
238: PetscBool ish2opus;
240: PetscFunctionBegin;
243: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
244: if (ish2opus) {
245: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247: PetscLayout t;
248: t = A->rmap;
249: A->rmap = a->h2opus_rmap;
250: a->h2opus_rmap = t;
251: t = A->cmap;
252: A->cmap = a->h2opus_cmap;
253: a->h2opus_cmap = t;
254: }
255: }
256: a->nativemult = nm;
257: }
258: PetscFunctionReturn(PETSC_SUCCESS);
259: }
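/* Typical use of the native-multiply mode (a usage sketch): with nm = PETSC_TRUE the matrix
   exposes the internal H2Opus ordering/distribution, so work vectors should be created from the
   (swapped) layouts of A itself, e.g.

     Vec x, y;
     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
     PetscCall(MatCreateVecs(A, &x, &y)); // x, y follow the H2Opus cluster-tree ordering
     PetscCall(MatMult(A, x, y));         // no PetscSF permutation is applied in this mode
*/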
261: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262: {
263: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
264: PetscBool ish2opus;
266: PetscFunctionBegin;
268: PetscAssertPointer(nm, 2);
269: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
270: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
271: *nm = a->nativemult;
272: PetscFunctionReturn(PETSC_SUCCESS);
273: }
275: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
276: {
277: PetscBool ish2opus;
278: PetscInt nmax = PETSC_DECIDE;
279: Mat_H2OPUS *a = NULL;
280: PetscBool mult = PETSC_FALSE;
282: PetscFunctionBegin;
283: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
284:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
285: a = (Mat_H2OPUS *)A->data;
287: nmax = a->norm_max_samples;
288: mult = a->nativemult;
289: PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
290: } else {
291: PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
292: }
293: PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
294: if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
295: PetscFunctionReturn(PETSC_SUCCESS);
296: }
298: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
299: {
300: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
301: PetscInt n;
302: PetscBool boundtocpu = PETSC_TRUE;
304: PetscFunctionBegin;
305: #if defined(PETSC_H2OPUS_USE_GPU)
306: boundtocpu = A->boundtocpu;
307: #endif
308: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
309: if (boundtocpu) {
310: if (h2opus->xxs < xN) {
311: h2opus->xx->resize(n * xN);
312: h2opus->xxs = xN;
313: }
314: if (h2opus->yys < yN) {
315: h2opus->yy->resize(n * yN);
316: h2opus->yys = yN;
317: }
318: }
319: #if defined(PETSC_H2OPUS_USE_GPU)
320: if (!boundtocpu) {
321: if (h2opus->xxs_gpu < xN) {
322: h2opus->xx_gpu->resize(n * xN);
323: h2opus->xxs_gpu = xN;
324: }
325: if (h2opus->yys_gpu < yN) {
326: h2opus->yy_gpu->resize(n * yN);
327: h2opus->yys_gpu = yN;
328: }
329: }
330: #endif
331: PetscFunctionReturn(PETSC_SUCCESS);
332: }
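/* MatMultNKernel_H2OPUS: computes C = s * op(A) * B for a dense B (multiple right-hand sides).
   When a PetscSF is attached and native multiply is off, B and C are first permuted into the
   H2Opus ordering through the SF, hgemv/distributed_hgemv is applied on the (possibly GPU)
   scratch buffers, and the result is scattered back to the PETSc ordering. */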
334: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
335: {
336: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
337: #if defined(H2OPUS_USE_MPI)
338: h2opusHandle_t handle = h2opus->handle->handle;
339: #else
340: h2opusHandle_t handle = h2opus->handle;
341: #endif
342: PetscBool boundtocpu = PETSC_TRUE;
343: PetscScalar *xx, *yy, *uxx, *uyy;
344: PetscInt blda, clda;
345: PetscMPIInt size;
346: PetscSF bsf, csf;
347: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
349: PetscFunctionBegin;
350: HLibProfile::clear();
351: #if defined(PETSC_H2OPUS_USE_GPU)
352: boundtocpu = A->boundtocpu;
353: #endif
354: PetscCall(MatDenseGetLDA(B, &blda));
355: PetscCall(MatDenseGetLDA(C, &clda));
356: if (usesf) {
357: PetscInt n;
359: PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
360: PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));
362: PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
363: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
364: blda = n;
365: clda = n;
366: }
367: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
368: if (boundtocpu) {
369: PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
370: PetscCall(MatDenseGetArrayWrite(C, &yy));
371: if (usesf) {
372: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
373: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
374: PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375: PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
376: } else {
377: uxx = xx;
378: uyy = yy;
379: }
380: if (size > 1) {
381: PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
382: PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
383: #if defined(H2OPUS_USE_MPI)
384: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
385: #endif
386: } else {
387: PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
388: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
389: }
390: PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
391: if (usesf) {
392: PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393: PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
394: }
395: PetscCall(MatDenseRestoreArrayWrite(C, &yy));
396: #if defined(PETSC_H2OPUS_USE_GPU)
397: } else {
398: PetscBool ciscuda, biscuda;
400: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
401: PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
402: if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
403: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
404: if (!ciscuda) {
405: C->assembled = PETSC_TRUE;
406: PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
407: }
408: PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
409: PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
410: if (usesf) {
411: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
412: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
413: PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414: PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
415: } else {
416: uxx = xx;
417: uyy = yy;
418: }
419: PetscCall(PetscLogGpuTimeBegin());
420: if (size > 1) {
421: PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
422: PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
423: #if defined(H2OPUS_USE_MPI)
424: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
425: #endif
426: } else {
427: PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
428: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
429: }
430: PetscCall(PetscLogGpuTimeEnd());
431: PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
432: if (usesf) {
433: PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434: PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
435: }
436: PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
437: if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
438: if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
439: #endif
440: }
441: { /* log flops */
442: double gops, time, perf, dev;
443: HLibProfile::getHgemvPerf(gops, time, perf, dev);
444: #if defined(PETSC_H2OPUS_USE_GPU)
445: if (boundtocpu) {
446: PetscCall(PetscLogFlops(1e9 * gops));
447: } else {
448: PetscCall(PetscLogGpuFlops(1e9 * gops));
449: }
450: #else
451: PetscCall(PetscLogFlops(1e9 * gops));
452: #endif
453: }
454: PetscFunctionReturn(PETSC_SUCCESS);
455: }
457: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
458: {
459: Mat_Product *product = C->product;
461: PetscFunctionBegin;
462: MatCheckProduct(C, 1);
463: switch (product->type) {
464: case MATPRODUCT_AB:
465: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
466: break;
467: case MATPRODUCT_AtB:
468: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
469: break;
470: default:
471: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
472: }
473: PetscFunctionReturn(PETSC_SUCCESS);
474: }
476: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
477: {
478: Mat_Product *product = C->product;
479: PetscBool cisdense;
480: Mat A, B;
482: PetscFunctionBegin;
483: MatCheckProduct(C, 1);
484: A = product->A;
485: B = product->B;
486: switch (product->type) {
487: case MATPRODUCT_AB:
488: PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
489: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
490: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
491: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
492: PetscCall(MatSetUp(C));
493: break;
494: case MATPRODUCT_AtB:
495: PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
496: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
497: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
498: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
499: PetscCall(MatSetUp(C));
500: break;
501: default:
502: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
503: }
504: C->ops->productsymbolic = NULL;
505: C->ops->productnumeric = MatProductNumeric_H2OPUS;
506: PetscFunctionReturn(PETSC_SUCCESS);
507: }
509: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
510: {
511: PetscFunctionBegin;
512: MatCheckProduct(C, 1);
513: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
514: PetscFunctionReturn(PETSC_SUCCESS);
515: }
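/* The MatProduct hooks above are used through the standard MatProduct API; a minimal sketch
   computing C = A * B with A of type MATH2OPUS and B dense:

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));

   or, equivalently, PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)). */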
517: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
518: {
519: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
520: #if defined(H2OPUS_USE_MPI)
521: h2opusHandle_t handle = h2opus->handle->handle;
522: #else
523: h2opusHandle_t handle = h2opus->handle;
524: #endif
525: PetscBool boundtocpu = PETSC_TRUE;
526: PetscInt n;
527: PetscScalar *xx, *yy, *uxx, *uyy;
528: PetscMPIInt size;
529: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
531: PetscFunctionBegin;
532: HLibProfile::clear();
533: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
534: #if defined(PETSC_H2OPUS_USE_GPU)
535: boundtocpu = A->boundtocpu;
536: #endif
537: if (usesf) {
538: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
539: } else n = A->rmap->n;
540: if (boundtocpu) {
541: PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
542: if (sy == 0.0) {
543: PetscCall(VecGetArrayWrite(y, &yy));
544: } else {
545: PetscCall(VecGetArray(y, &yy));
546: }
547: if (usesf) {
548: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
549: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
551: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
553: if (sy != 0.0) {
554: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
556: }
557: } else {
558: uxx = xx;
559: uyy = yy;
560: }
561: if (size > 1) {
562: PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
563: PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
564: #if defined(H2OPUS_USE_MPI)
565: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
566: #endif
567: } else {
568: PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
569: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
570: }
571: PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
572: if (usesf) {
573: PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574: PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
575: }
576: if (sy == 0.0) {
577: PetscCall(VecRestoreArrayWrite(y, &yy));
578: } else {
579: PetscCall(VecRestoreArray(y, &yy));
580: }
581: #if defined(PETSC_H2OPUS_USE_GPU)
582: } else {
583: PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
584: if (sy == 0.0) {
585: PetscCall(VecCUDAGetArrayWrite(y, &yy));
586: } else {
587: PetscCall(VecCUDAGetArray(y, &yy));
588: }
589: if (usesf) {
590: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
591: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
593: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
595: if (sy != 0.0) {
596: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
598: }
599: } else {
600: uxx = xx;
601: uyy = yy;
602: }
603: PetscCall(PetscLogGpuTimeBegin());
604: if (size > 1) {
605: PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
606: PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
607: #if defined(H2OPUS_USE_MPI)
608: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
609: #endif
610: } else {
611: PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
612: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
613: }
614: PetscCall(PetscLogGpuTimeEnd());
615: PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
616: if (usesf) {
617: PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618: PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
619: }
620: if (sy == 0.0) {
621: PetscCall(VecCUDARestoreArrayWrite(y, &yy));
622: } else {
623: PetscCall(VecCUDARestoreArray(y, &yy));
624: }
625: #endif
626: }
627: { /* log flops */
628: double gops, time, perf, dev;
629: HLibProfile::getHgemvPerf(gops, time, perf, dev);
630: #if defined(PETSC_H2OPUS_USE_GPU)
631: if (boundtocpu) {
632: PetscCall(PetscLogFlops(1e9 * gops));
633: } else {
634: PetscCall(PetscLogGpuFlops(1e9 * gops));
635: }
636: #else
637: PetscCall(PetscLogFlops(1e9 * gops));
638: #endif
639: }
640: PetscFunctionReturn(PETSC_SUCCESS);
641: }
643: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
644: {
645: PetscBool xiscuda, yiscuda;
647: PetscFunctionBegin;
648: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
649: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
650: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
651: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
652: PetscFunctionReturn(PETSC_SUCCESS);
653: }
655: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656: {
657: PetscBool xiscuda, yiscuda;
659: PetscFunctionBegin;
660: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
661: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
662: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
663: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
664: PetscFunctionReturn(PETSC_SUCCESS);
665: }
667: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668: {
669: PetscBool xiscuda, ziscuda;
671: PetscFunctionBegin;
672: PetscCall(VecCopy(y, z));
673: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
674: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
675: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
676: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
677: PetscFunctionReturn(PETSC_SUCCESS);
678: }
680: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
681: {
682: PetscBool xiscuda, ziscuda;
684: PetscFunctionBegin;
685: PetscCall(VecCopy(y, z));
686: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
687: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
688: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
689: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
690: PetscFunctionReturn(PETSC_SUCCESS);
691: }
693: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
694: {
695: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
697: PetscFunctionBegin;
698: a->s *= s;
699: PetscFunctionReturn(PETSC_SUCCESS);
700: }
702: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
703: {
704: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
706: PetscFunctionBegin;
707: PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
708: PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
709: PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
710: PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
711: PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
712: PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
713: PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
714: PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
715: PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
716: PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
717: PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
718: PetscOptionsHeadEnd();
719: PetscFunctionReturn(PETSC_SUCCESS);
720: }
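/* Example command line using the options registered above (the values are only illustrative):
     -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 -mat_h2opus_order 6 \
     -mat_h2opus_maxrank 128 -mat_h2opus_samples 32 -mat_h2opus_rtol 1e-6 -mat_h2opus_check */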
722: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);
724: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
725: {
726: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
727: Vec c;
728: PetscInt spacedim;
729: const PetscScalar *coords;
731: PetscFunctionBegin;
732: if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
733: PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
734: if (!c && a->sampler) {
735: Mat S = a->sampler->GetSamplingMat();
737: PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
738: }
739: if (!c) {
740: PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
741: } else {
742: PetscCall(VecGetArrayRead(c, &coords));
743: PetscCall(VecGetBlockSize(c, &spacedim));
744: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
745: PetscCall(VecRestoreArrayRead(c, &coords));
746: }
747: PetscFunctionReturn(PETSC_SUCCESS);
748: }
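/* MatSetUpMultiply_H2OPUS: builds the PetscSF (and the host/device scratch buffers) that maps
   between the PETSc parallel layout of A and the index_map ordering of the H2Opus basis tree.
   If the index map turns out to be the identity with the same distribution, no SF is created and
   MatMult operates directly on the user ordering. */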
750: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
751: {
752: MPI_Comm comm;
753: PetscMPIInt size;
754: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
755: PetscInt n = 0, *idx = NULL;
756: int *iidx = NULL;
757: PetscCopyMode own;
758: PetscBool rid;
760: PetscFunctionBegin;
761: if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
762:   if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
763: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
764: #if defined(PETSC_H2OPUS_USE_GPU)
765: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
766: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
767: a->xxs_gpu = 1;
768: a->yys_gpu = 1;
769: #endif
770: a->xx = new thrust::host_vector<PetscScalar>(n);
771: a->yy = new thrust::host_vector<PetscScalar>(n);
772: a->xxs = 1;
773: a->yys = 1;
774: } else {
775: IS is;
776: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
777: PetscCallMPI(MPI_Comm_size(comm, &size));
778: if (!a->h2opus_indexmap) {
779: if (size > 1) {
780: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
781: #if defined(H2OPUS_USE_MPI)
782: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
783: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
784: #endif
785: } else {
786: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
787: n = a->hmatrix->u_basis_tree.index_map.size();
788: }
790: if (PetscDefined(USE_64BIT_INDICES)) {
791: PetscInt i;
793: own = PETSC_OWN_POINTER;
794: PetscCall(PetscMalloc1(n, &idx));
795: for (i = 0; i < n; i++) idx[i] = iidx[i];
796: } else {
797: own = PETSC_COPY_VALUES;
798: idx = (PetscInt *)iidx;
799: }
800: PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
801: PetscCall(ISSetPermutation(is));
802: PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
803: a->h2opus_indexmap = is;
804: }
805: PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
806: PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
807: rid = (PetscBool)(n == A->rmap->n);
808: PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
809: if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
810: if (!rid) {
811: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
812: PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
813: PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
814: PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
815: PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
816: }
817: PetscCall(PetscSFCreate(comm, &a->sf));
818: PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
819: PetscCall(PetscSFSetUp(a->sf));
820: PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
821: #if defined(PETSC_H2OPUS_USE_GPU)
822: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
823: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
824: a->xxs_gpu = 1;
825: a->yys_gpu = 1;
826: #endif
827: a->xx = new thrust::host_vector<PetscScalar>(n);
828: a->yy = new thrust::host_vector<PetscScalar>(n);
829: a->xxs = 1;
830: a->yys = 1;
831: }
832: PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
833: }
834: a->multsetup = PETSC_TRUE;
835: PetscFunctionReturn(PETSC_SUCCESS);
836: }
838: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
839: {
840: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
841: #if defined(H2OPUS_USE_MPI)
842: h2opusHandle_t handle = a->handle->handle;
843: #else
844: h2opusHandle_t handle = a->handle;
845: #endif
846: PetscBool kernel = PETSC_FALSE;
847: PetscBool boundtocpu = PETSC_TRUE;
848: PetscBool samplingdone = PETSC_FALSE;
849: MPI_Comm comm;
850: PetscMPIInt size;
852: PetscFunctionBegin;
853: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
854: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
855: PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
857: /* XXX */
858: a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
860: PetscCallMPI(MPI_Comm_size(comm, &size));
861: /* TODO REUSABILITY of geometric construction */
862: delete a->hmatrix;
863: delete a->dist_hmatrix;
864: #if defined(PETSC_H2OPUS_USE_GPU)
865: delete a->hmatrix_gpu;
866: delete a->dist_hmatrix_gpu;
867: #endif
868: a->orthogonal = PETSC_FALSE;
870: /* TODO: other? */
871: H2OpusBoxCenterAdmissibility adm(a->eta);
873: PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
874: if (size > 1) {
875: #if defined(H2OPUS_USE_MPI)
876: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
877: #else
878: a->dist_hmatrix = NULL;
879: #endif
880: } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
881: PetscCall(MatH2OpusInferCoordinates_Private(A));
882: PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
883: if (a->kernel) {
884: BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
885: if (size > 1) {
886: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
887: #if defined(H2OPUS_USE_MPI)
888: buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
889: #endif
890: } else {
891: buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
892: }
893: kernel = PETSC_TRUE;
894: } else {
895: PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
896: buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
897: }
898: PetscCall(MatSetUpMultiply_H2OPUS(A));
900: #if defined(PETSC_H2OPUS_USE_GPU)
901: boundtocpu = A->boundtocpu;
902: if (!boundtocpu) {
903: if (size > 1) {
904: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
905: #if defined(H2OPUS_USE_MPI)
906: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
907: #endif
908: } else {
909: a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
910: }
911: }
912: #endif
913: if (size == 1) {
914: if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
915: PetscReal Anorm;
916: bool verbose;
918: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
919: verbose = a->hara_verbose;
920: PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
921: if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
922: if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
923: a->sampler->SetStream(handle->getMainStream());
924: if (boundtocpu) {
925: a->sampler->SetGPUSampling(false);
926: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
927: #if defined(PETSC_H2OPUS_USE_GPU)
928: } else {
929: a->sampler->SetGPUSampling(true);
930: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
931: #endif
932: }
933: samplingdone = PETSC_TRUE;
934: }
935: }
936: #if defined(PETSC_H2OPUS_USE_GPU)
937: if (!boundtocpu) {
938: delete a->hmatrix;
939: delete a->dist_hmatrix;
940: a->hmatrix = NULL;
941: a->dist_hmatrix = NULL;
942: }
943: A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
944: #endif
945: PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
947: if (!a->s) a->s = 1.0;
948: A->assembled = PETSC_TRUE;
950: if (samplingdone) {
951: PetscBool check = a->check_construction;
952: PetscBool checke = PETSC_FALSE;
954: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
955: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
956: if (check) {
957: Mat E, Ae;
958: PetscReal n1, ni, n2;
959: PetscReal n1A, niA, n2A;
960: void (*normfunc)(void);
962: Ae = a->sampler->GetSamplingMat();
963: PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
964: PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
965: PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
966: PetscCall(MatNorm(E, NORM_1, &n1));
967: PetscCall(MatNorm(E, NORM_INFINITY, &ni));
968: PetscCall(MatNorm(E, NORM_2, &n2));
969: if (checke) {
970: Mat eA, eE, eAe;
972: PetscCall(MatComputeOperator(A, MATAIJ, &eA));
973: PetscCall(MatComputeOperator(E, MATAIJ, &eE));
974: PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
975: PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976: PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
977: PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
978: PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
979: PetscCall(MatView(eA, NULL));
980: PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
981: PetscCall(MatView(eAe, NULL));
982: PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
983: PetscCall(MatView(eE, NULL));
984: PetscCall(MatDestroy(&eA));
985: PetscCall(MatDestroy(&eE));
986: PetscCall(MatDestroy(&eAe));
987: }
989: PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
990: PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
991: PetscCall(MatNorm(Ae, NORM_1, &n1A));
992: PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
993: PetscCall(MatNorm(Ae, NORM_2, &n2A));
994: n1A = PetscMax(n1A, PETSC_SMALL);
995: n2A = PetscMax(n2A, PETSC_SMALL);
996: niA = PetscMax(niA, PETSC_SMALL);
997: PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
998: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
999: PetscCall(MatDestroy(&E));
1000: }
1001: a->sampler->SetSamplingMat(NULL);
1002: }
1003: PetscFunctionReturn(PETSC_SUCCESS);
1004: }
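/* Construction from matrix-vector products (the sampling branch handled above) is normally
   driven by MatCreateH2OpusFromMat(); a sketch, assuming the signature declared in petscmat.h:

     Mat H;
     PetscCall(MatCreateH2OpusFromMat(B, spacedim, coords, PETSC_TRUE, eta, leafsize, maxrank, bs, rtol, &H));
     PetscCall(MatAssemblyBegin(H, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(H, MAT_FINAL_ASSEMBLY)); // runs the hara() sampling performed here
*/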
1006: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007: {
1008: PetscMPIInt size;
1009: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1011: PetscFunctionBegin;
1012: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1013: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1014: a->hmatrix->clearData();
1015: #if defined(PETSC_H2OPUS_USE_GPU)
1016: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017: #endif
1018: PetscFunctionReturn(PETSC_SUCCESS);
1019: }
1021: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1022: {
1023: Mat A;
1024: Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1025: #if defined(PETSC_H2OPUS_USE_GPU)
1026: PetscBool iscpu = PETSC_FALSE;
1027: #else
1028: PetscBool iscpu = PETSC_TRUE;
1029: #endif
1030: MPI_Comm comm;
1032: PetscFunctionBegin;
1033: PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1034: PetscCall(MatCreate(comm, &A));
1035: PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1036: PetscCall(MatSetType(A, MATH2OPUS));
1037: PetscCall(MatPropagateSymmetryOptions(B, A));
1038: a = (Mat_H2OPUS *)A->data;
1040: a->eta = b->eta;
1041: a->leafsize = b->leafsize;
1042: a->basisord = b->basisord;
1043: a->max_rank = b->max_rank;
1044: a->bs = b->bs;
1045: a->rtol = b->rtol;
1046: a->norm_max_samples = b->norm_max_samples;
1047: if (op == MAT_COPY_VALUES) a->s = b->s;
1049: a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1050: if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1052: #if defined(H2OPUS_USE_MPI)
1053: if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1054: #if defined(PETSC_H2OPUS_USE_GPU)
1055: if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1056: #endif
1057: #endif
1058: if (b->hmatrix) {
1059: a->hmatrix = new HMatrix(*b->hmatrix);
1060: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1061: }
1062: #if defined(PETSC_H2OPUS_USE_GPU)
1063: if (b->hmatrix_gpu) {
1064: a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1065: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1066: }
1067: #endif
1068: if (b->sf) {
1069: PetscCall(PetscObjectReference((PetscObject)b->sf));
1070: a->sf = b->sf;
1071: }
1072: if (b->h2opus_indexmap) {
1073: PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1074: a->h2opus_indexmap = b->h2opus_indexmap;
1075: }
1077: PetscCall(MatSetUp(A));
1078: PetscCall(MatSetUpMultiply_H2OPUS(A));
1079: if (op == MAT_COPY_VALUES) {
1080: A->assembled = PETSC_TRUE;
1081: a->orthogonal = b->orthogonal;
1082: #if defined(PETSC_H2OPUS_USE_GPU)
1083: A->offloadmask = B->offloadmask;
1084: #endif
1085: }
1086: #if defined(PETSC_H2OPUS_USE_GPU)
1087: iscpu = B->boundtocpu;
1088: #endif
1089: PetscCall(MatBindToCPU(A, iscpu));
1091: *nA = A;
1092: PetscFunctionReturn(PETSC_SUCCESS);
1093: }
1095: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1096: {
1097: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1098: PetscBool isascii, vieweps;
1099: PetscMPIInt size;
1100: PetscViewerFormat format;
1102: PetscFunctionBegin;
1103: PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1104: PetscCall(PetscViewerGetFormat(view, &format));
1105: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1106: if (isascii) {
1107: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1108: if (size == 1) {
1109: FILE *fp;
1110: PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1111: dumpHMatrix(*h2opus->hmatrix, 6, fp);
1112: }
1113: } else {
1114: PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1115: PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1116: PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1117: if (!h2opus->kernel) {
1118: PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1119: } else {
1120: PetscCall(PetscViewerASCIIPrintf(view, " Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1121: }
1122: PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1123: if (size == 1) {
1124: double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1125: double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1126: #if defined(PETSC_HAVE_CUDA)
1127: double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1128: double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1129: #endif
1130: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1131: #if defined(PETSC_HAVE_CUDA)
1132: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1133: #endif
1134: } else {
1135: #if defined(PETSC_HAVE_CUDA)
1136: double matrix_mem[4] = {0., 0., 0., 0.};
1137: PetscMPIInt rsize = 4;
1138: #else
1139: double matrix_mem[2] = {0., 0.};
1140: PetscMPIInt rsize = 2;
1141: #endif
1142: #if defined(H2OPUS_USE_MPI)
1143: matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1144: matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1145: #if defined(PETSC_HAVE_CUDA)
1146: matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1147: matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1148: #endif
1149: #endif
1150:       PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE, MPI_SUM, PetscObjectComm((PetscObject)A)));
1151: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1152: #if defined(PETSC_HAVE_CUDA)
1153: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1154: #endif
1155: }
1156: }
1157: }
1158: vieweps = PETSC_FALSE;
1159: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1160: if (vieweps) {
1161: char filename[256];
1162: const char *name;
1164: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1165: PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1166: PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1167: outputEps(*h2opus->hmatrix, filename);
1168: }
1169: PetscFunctionReturn(PETSC_SUCCESS);
1170: }
1172: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1173: {
1174: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1175: PetscReal *gcoords;
1176: PetscInt N;
1177: MPI_Comm comm;
1178: PetscMPIInt size;
1179: PetscBool cong;
1181: PetscFunctionBegin;
1182: PetscCall(PetscLayoutSetUp(A->rmap));
1183: PetscCall(PetscLayoutSetUp(A->cmap));
1184: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1185: PetscCall(MatHasCongruentLayouts(A, &cong));
1186: PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1187: N = A->rmap->N;
1188: PetscCallMPI(MPI_Comm_size(comm, &size));
1189: if (spacedim > 0 && size > 1 && cdist) {
1190: PetscSF sf;
1191: MPI_Datatype dtype;
1193: PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1194: PetscCallMPI(MPI_Type_commit(&dtype));
1196: PetscCall(PetscSFCreate(comm, &sf));
1197: PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1198: PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1199: PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1200: PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1201: PetscCall(PetscSFDestroy(&sf));
1202: PetscCallMPI(MPI_Type_free(&dtype));
1203: } else gcoords = (PetscReal *)coords;
1205: delete h2opus->ptcloud;
1206: delete h2opus->kernel;
1207: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1208: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1209: if (gcoords != coords) PetscCall(PetscFree(gcoords));
1210: A->preallocated = PETSC_TRUE;
1211: PetscFunctionReturn(PETSC_SUCCESS);
1212: }
1214: #if defined(PETSC_H2OPUS_USE_GPU)
1215: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1216: {
1217: PetscMPIInt size;
1218: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1220: PetscFunctionBegin;
1221: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1222: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1223: if (size > 1) {
1224: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225: #if defined(H2OPUS_USE_MPI)
1226: if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1227: else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1228: #endif
1229: } else {
1230: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1231: if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1232: else *a->hmatrix = *a->hmatrix_gpu;
1233: }
1234: delete a->hmatrix_gpu;
1235: delete a->dist_hmatrix_gpu;
1236: a->hmatrix_gpu = NULL;
1237: a->dist_hmatrix_gpu = NULL;
1238: A->offloadmask = PETSC_OFFLOAD_CPU;
1239: } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1240: if (size > 1) {
1241: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242: #if defined(H2OPUS_USE_MPI)
1243: if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1244: else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1245: #endif
1246: } else {
1247: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1248: if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1249: else *a->hmatrix_gpu = *a->hmatrix;
1250: }
1251: delete a->hmatrix;
1252: delete a->dist_hmatrix;
1253: a->hmatrix = NULL;
1254: a->dist_hmatrix = NULL;
1255: A->offloadmask = PETSC_OFFLOAD_GPU;
1256: }
1257: PetscCall(PetscFree(A->defaultvectype));
1258: if (!flg) {
1259: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1260: } else {
1261: PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1262: }
1263: A->boundtocpu = flg;
1264: PetscFunctionReturn(PETSC_SUCCESS);
1265: }
1266: #endif
1268: /*MC
1269: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1271: Options Database Key:
1272: . -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1274: Level: beginner
1276: Notes:
1277: H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports CPU or NVIDIA GPUs.
1279:    For CPU-only builds, use `./configure --download-h2opus --download-thrust` to build PETSc with H2Opus support.
1280:    To run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1282: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1283: M*/
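/* A minimal usage sketch (assuming the MatCreateH2OpusFromKernel() signature declared in petscmat.h):

     extern PetscScalar MyKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx); // user callback
     Mat A;
     PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, m, m, M, M, spacedim, coords, PETSC_TRUE,
                                         MyKernel, NULL, eta, leafsize, basisord, &A));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/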
1284: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1285: {
1286: Mat_H2OPUS *a;
1287: PetscMPIInt size;
1289: PetscFunctionBegin;
1290: #if defined(PETSC_H2OPUS_USE_GPU)
1291: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1292: #endif
1293: PetscCall(PetscNew(&a));
1294: A->data = (void *)a;
1296: a->eta = 0.9;
1297: a->leafsize = 32;
1298: a->basisord = 4;
1299: a->max_rank = 64;
1300: a->bs = 32;
1301: a->rtol = 1.e-4;
1302: a->s = 1.0;
1303: a->norm_max_samples = 10;
1304: a->resize = PETSC_TRUE; /* reallocate after compression */
1305: #if defined(H2OPUS_USE_MPI)
1306: h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1307: #else
1308: h2opusCreateHandle(&a->handle);
1309: #endif
1310: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1311: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1312: PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1314: A->ops->destroy = MatDestroy_H2OPUS;
1315: A->ops->view = MatView_H2OPUS;
1316: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1317: A->ops->mult = MatMult_H2OPUS;
1318: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1319: A->ops->multadd = MatMultAdd_H2OPUS;
1320: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1321: A->ops->scale = MatScale_H2OPUS;
1322: A->ops->duplicate = MatDuplicate_H2OPUS;
1323: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1324: A->ops->norm = MatNorm_H2OPUS;
1325: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1326: #if defined(PETSC_H2OPUS_USE_GPU)
1327: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1328: #endif
1330: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1331: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1332: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1333: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1334: #if defined(PETSC_H2OPUS_USE_GPU)
1335: PetscCall(PetscFree(A->defaultvectype));
1336: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1337: #endif
1338: PetscFunctionReturn(PETSC_SUCCESS);
1339: }
1341: /*@C
1342: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1344: Input Parameter:
1345: . A - the matrix
1347: Level: intermediate
1349: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1350: @*/
1351: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1352: {
1353: PetscBool ish2opus;
1354: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1355: PetscMPIInt size;
1356: PetscBool boundtocpu = PETSC_TRUE;
1358: PetscFunctionBegin;
1361: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1362: if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1363: if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1364: HLibProfile::clear();
1365: PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1366: #if defined(PETSC_H2OPUS_USE_GPU)
1367: boundtocpu = A->boundtocpu;
1368: #endif
1369: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1370: if (size > 1) {
1371: if (boundtocpu) {
1372: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1373: #if defined(H2OPUS_USE_MPI)
1374: distributed_horthog(*a->dist_hmatrix, a->handle);
1375: #endif
1376: #if defined(PETSC_H2OPUS_USE_GPU)
1377: A->offloadmask = PETSC_OFFLOAD_CPU;
1378: } else {
1379: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1380: PetscCall(PetscLogGpuTimeBegin());
1381: #if defined(H2OPUS_USE_MPI)
1382: distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1383: #endif
1384: PetscCall(PetscLogGpuTimeEnd());
1385: #endif
1386: }
1387: } else {
1388: #if defined(H2OPUS_USE_MPI)
1389: h2opusHandle_t handle = a->handle->handle;
1390: #else
1391: h2opusHandle_t handle = a->handle;
1392: #endif
1393: if (boundtocpu) {
1394: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1395: horthog(*a->hmatrix, handle);
1396: #if defined(PETSC_H2OPUS_USE_GPU)
1397: A->offloadmask = PETSC_OFFLOAD_CPU;
1398: } else {
1399: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1400: PetscCall(PetscLogGpuTimeBegin());
1401: horthog(*a->hmatrix_gpu, handle);
1402: PetscCall(PetscLogGpuTimeEnd());
1403: #endif
1404: }
1405: }
1406: a->orthogonal = PETSC_TRUE;
1407: { /* log flops */
1408: double gops, time, perf, dev;
1409: HLibProfile::getHorthogPerf(gops, time, perf, dev);
1410: #if defined(PETSC_H2OPUS_USE_GPU)
1411: if (boundtocpu) {
1412: PetscCall(PetscLogFlops(1e9 * gops));
1413: } else {
1414: PetscCall(PetscLogGpuFlops(1e9 * gops));
1415: }
1416: #else
1417: PetscCall(PetscLogFlops(1e9 * gops));
1418: #endif
1419: }
1420: PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1421: PetscFunctionReturn(PETSC_SUCCESS);
1422: }
1424: /*@C
1425: MatH2OpusCompress - Compress a hierarchical matrix.
1427: Input Parameters:
1428: + A - the matrix
1429: - tol - the absolute truncation threshold
1431: Level: intermediate
1433: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1434: @*/
1435: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1436: {
1437: PetscBool ish2opus;
1438: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1439: PetscMPIInt size;
1440: PetscBool boundtocpu = PETSC_TRUE;
1442: PetscFunctionBegin;
1446: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1447: if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1448: PetscCall(MatH2OpusOrthogonalize(A));
1449: HLibProfile::clear();
1450: PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1451: #if defined(PETSC_H2OPUS_USE_GPU)
1452: boundtocpu = A->boundtocpu;
1453: #endif
1454: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1455: if (size > 1) {
1456: if (boundtocpu) {
1457: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1458: #if defined(H2OPUS_USE_MPI)
1459: distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1460: if (a->resize) {
1461: DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1462: delete a->dist_hmatrix;
1463: a->dist_hmatrix = dist_hmatrix;
1464: }
1465: #endif
1466: #if defined(PETSC_H2OPUS_USE_GPU)
1467: A->offloadmask = PETSC_OFFLOAD_CPU;
1468: } else {
1469: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1470: PetscCall(PetscLogGpuTimeBegin());
1471: #if defined(H2OPUS_USE_MPI)
1472: distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1474: if (a->resize) {
1475: DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1476: delete a->dist_hmatrix_gpu;
1477: a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1478: }
1479: #endif
1480: PetscCall(PetscLogGpuTimeEnd());
1481: #endif
1482: }
1483: } else {
1484: #if defined(H2OPUS_USE_MPI)
1485: h2opusHandle_t handle = a->handle->handle;
1486: #else
1487: h2opusHandle_t handle = a->handle;
1488: #endif
1489: if (boundtocpu) {
1490: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1491: hcompress(*a->hmatrix, tol, handle);
1493: if (a->resize) {
1494: HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1495: delete a->hmatrix;
1496: a->hmatrix = hmatrix;
1497: }
1498: #if defined(PETSC_H2OPUS_USE_GPU)
1499: A->offloadmask = PETSC_OFFLOAD_CPU;
1500: } else {
1501: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1502: PetscCall(PetscLogGpuTimeBegin());
1503: hcompress(*a->hmatrix_gpu, tol, handle);
1504: PetscCall(PetscLogGpuTimeEnd());
1506: if (a->resize) {
1507: HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1508: delete a->hmatrix_gpu;
1509: a->hmatrix_gpu = hmatrix_gpu;
1510: }
1511: #endif
1512: }
1513: }
1514: { /* log flops */
1515: double gops, time, perf, dev;
1516: HLibProfile::getHcompressPerf(gops, time, perf, dev);
1517: #if defined(PETSC_H2OPUS_USE_GPU)
1518: if (boundtocpu) {
1519: PetscCall(PetscLogFlops(1e9 * gops));
1520: } else {
1521: PetscCall(PetscLogGpuFlops(1e9 * gops));
1522: }
1523: #else
1524: PetscCall(PetscLogFlops(1e9 * gops));
1525: #endif
1526: }
1527: PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1528: PetscFunctionReturn(PETSC_SUCCESS);
1529: }
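/* Illustrative usage sketch (not part of the library source): compress an assembled MATH2OPUS
   matrix `H` with a hypothetical absolute truncation threshold. As the code above shows, a
   non-positive tolerance makes the call a no-op and the matrix is orthogonalized first if needed.

     PetscReal tol = 1.e-6;                  // hypothetical threshold
     PetscCall(MatH2OpusCompress(H, tol));
*/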
1531: /*@C
1532: MatH2OpusSetSamplingMat - Set the matrix whose matrix-vector products are sampled in order to construct the hierarchical matrix.
1534: Input Parameters:
1535: + A - the hierarchical matrix
1536: . B - the matrix to be sampled
1537: . bs - maximum number of samples to be taken concurrently
1538: - tol - relative tolerance for construction
1540: Level: intermediate
1542: Notes:
1543: You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1545: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1546: @*/
1547: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1548: {
1549: PetscBool ish2opus;
1551: PetscFunctionBegin;
1557: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1558: if (ish2opus) {
1559: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1561: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1562: a->sampler->SetSamplingMat(B);
1563: if (bs > 0) a->bs = bs;
1564: if (tol > 0.) a->rtol = tol;
1565: delete a->kernel;
1566: }
1567: PetscFunctionReturn(PETSC_SUCCESS);
1568: }
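/* Illustrative usage sketch (not part of the library source): replace the operator being sampled
   and rebuild the hierarchical approximation. `H` is a MATH2OPUS matrix created with
   MatCreateH2OpusFromMat() and `Bnew` is a hypothetical operator with the same layout; as the code
   above shows, non-positive bs/tol values leave the current settings unchanged.

     PetscCall(MatH2OpusSetSamplingMat(H, Bnew, PETSC_DECIDE, 0.0));
     PetscCall(MatAssemblyBegin(H, MAT_FINAL_ASSEMBLY));  // re-samples against Bnew
     PetscCall(MatAssemblyEnd(H, MAT_FINAL_ASSEMBLY));
*/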
1570: /*@C
1571: MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1573: Input Parameters:
1574: + comm - MPI communicator
1575: . m - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
1576: . n - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
1577: . M - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
1578: . N - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
1579: . spacedim - dimension of the space coordinates
1580: . coords - coordinates of the points
1581: . cdist - whether or not coordinates are distributed
1582: . kernel - computational kernel (or `NULL`)
1583: . kernelctx - kernel context
1584: . eta - admissibility condition tolerance
1585: . leafsize - leaf size in cluster tree
1586: - basisord - approximation order for Chebyshev interpolation of low-rank blocks
1588: Output Parameter:
1589: . nA - matrix
1591: Options Database Keys:
1592: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1593: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1594: . -mat_h2opus_order <`PetscInt`> - Chebyshev approximation order
1595: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1597: Level: intermediate
1599: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1600: @*/
1601: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1602: {
1603: Mat A;
1604: Mat_H2OPUS *h2opus;
1605: #if defined(PETSC_H2OPUS_USE_GPU)
1606: PetscBool iscpu = PETSC_FALSE;
1607: #else
1608: PetscBool iscpu = PETSC_TRUE;
1609: #endif
1611: PetscFunctionBegin;
1612: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1613: PetscCall(MatCreate(comm, &A));
1614: PetscCall(MatSetSizes(A, m, n, M, N));
1615: PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1616: PetscCall(MatSetType(A, MATH2OPUS));
1617: PetscCall(MatBindToCPU(A, iscpu));
1618: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1620: h2opus = (Mat_H2OPUS *)A->data;
1621: if (eta > 0.) h2opus->eta = eta;
1622: if (leafsize > 0) h2opus->leafsize = leafsize;
1623: if (basisord > 0) h2opus->basisord = basisord;
1625: *nA = A;
1626: PetscFunctionReturn(PETSC_SUCCESS);
1627: }
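/* Illustrative usage sketch (not part of the library source): build a MATH2OPUS matrix from a
   kernel evaluated on a point cloud. The kernel, the sizes n/N/spacedim and the coordinate array
   `coords` (n * spacedim entries per process) are hypothetical; passing PETSC_DEFAULT keeps the
   default eta, leafsize and basisord, since only positive values are honored.

     static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal r2 = 0.0;
       for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
       return PetscExpReal(-r2);   // K(x,y) = exp(-|x-y|^2)
     }

     Mat H;
     PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, N, N, spacedim, coords, PETSC_FALSE, GaussKernel, NULL, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, &H));
     PetscCall(MatSetFromOptions(H));
     PetscCall(MatAssemblyBegin(H, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(H, MAT_FINAL_ASSEMBLY));
*/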
1629: /*@C
1630: MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1632: Input Parameters:
1633: + B - the matrix to be sampled
1634: . spacedim - dimension of the space coordinates
1635: . coords - coordinates of the points
1636: . cdist - whether or not coordinates are distributed
1637: . eta - admissibility condition tolerance
1638: . leafsize - leaf size in cluster tree
1639: . maxrank - maximum rank allowed
1640: . bs - maximum number of samples to be taken concurrently
1641: - rtol - relative tolerance for construction
1643: Output Parameter:
1644: . nA - matrix
1646: Options Database Keys:
1647: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1648: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1649: . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1650: . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1651: . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1652: . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during `MatAssemblyEnd()`
1653: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1654: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1656: Level: intermediate
1658: Note:
1659: Not available in parallel
1661: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1662: @*/
1663: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1664: {
1665: Mat A;
1666: Mat_H2OPUS *h2opus;
1667: MPI_Comm comm;
1668: PetscBool boundtocpu = PETSC_TRUE;
1670: PetscFunctionBegin;
1679: PetscAssertPointer(nA, 10);
1680: PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1681: PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1682: PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1683: PetscCall(MatCreate(comm, &A));
1684: PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1685: #if defined(PETSC_H2OPUS_USE_GPU)
1686: {
1687: VecType vtype;
1688: PetscBool isstd, iscuda, iskok;
1690: PetscCall(MatGetVecType(B, &vtype));
1691: PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1692: PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1693: PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1694: PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1695: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1696: if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1697: }
1698: #endif
1699: PetscCall(MatSetType(A, MATH2OPUS));
1700: PetscCall(MatBindToCPU(A, boundtocpu));
1701: if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1702: PetscCall(MatPropagateSymmetryOptions(B, A));
1703: /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1705: h2opus = (Mat_H2OPUS *)A->data;
1706: h2opus->sampler = new PetscMatrixSampler(B);
1707: if (eta > 0.) h2opus->eta = eta;
1708: if (leafsize > 0) h2opus->leafsize = leafsize;
1709: if (maxrank > 0) h2opus->max_rank = maxrank;
1710: if (bs > 0) h2opus->bs = bs;
1711: if (rtol > 0.) h2opus->rtol = rtol;
1712: *nA = A;
1713: A->preallocated = PETSC_TRUE;
1714: PetscFunctionReturn(PETSC_SUCCESS);
1715: }
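/* Illustrative usage sketch (not part of the library source): build a hierarchical approximation
   of an existing square symmetric operator `B` by sampling its action. `coords` and `spacedim`
   are hypothetical; PETSC_DEFAULT keeps the remaining defaults (only positive values are honored),
   and the sampling itself happens at assembly time.

     Mat H;
     PetscCall(MatCreateH2OpusFromMat(B, spacedim, coords, PETSC_FALSE, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT, &H));
     PetscCall(MatSetFromOptions(H));
     PetscCall(MatAssemblyBegin(H, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(H, MAT_FINAL_ASSEMBLY));
*/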
1717: /*@C
1718: MatH2OpusGetIndexMap - Access the reordering index set.
1720: Input Parameter:
1721: . A - the matrix
1723: Output Parameter:
1724: . indexmap - the index set for the reordering
1726: Level: intermediate
1728: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1729: @*/
1730: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1731: {
1732: PetscBool ish2opus;
1733: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1735: PetscFunctionBegin;
1738: PetscAssertPointer(indexmap, 2);
1739: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1740: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1741: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1742: *indexmap = a->h2opus_indexmap;
1743: PetscFunctionReturn(PETSC_SUCCESS);
1744: }
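/* Illustrative usage sketch (not part of the library source): inspect the clustering reordering
   of an assembled MATH2OPUS matrix `H`. As the code above shows, the returned IS is the matrix's
   internal object, so the caller must not destroy it.

     IS map;
     PetscCall(MatH2OpusGetIndexMap(H, &map));
     PetscCall(ISView(map, PETSC_VIEWER_STDOUT_WORLD));
*/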
1746: /*@C
1747: MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1749: Input Parameters:
1750: + A - the matrix
1751: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1752: - in - the vector to be mapped
1754: Output Parameter:
1755: . out - the newly created mapped vector
1757: Level: intermediate
1759: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1760: @*/
1761: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1762: {
1763: PetscBool ish2opus;
1764: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1765: PetscScalar *xin, *xout;
1766: PetscBool nm;
1768: PetscFunctionBegin;
1773: PetscAssertPointer(out, 4);
1774: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1775: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1776: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1777: nm = a->nativemult;
1778: PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1779: PetscCall(MatCreateVecs(A, out, NULL));
1780: PetscCall(MatH2OpusSetNativeMult(A, nm));
1781: if (!a->sf) { /* same ordering */
1782: PetscCall(VecCopy(in, *out));
1783: PetscFunctionReturn(PETSC_SUCCESS);
1784: }
1785: PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1786: PetscCall(VecGetArrayWrite(*out, &xout));
1787: if (nativetopetsc) {
1788: PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1789: PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1790: } else {
1791: PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792: PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1793: }
1794: PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1795: PetscCall(VecRestoreArrayWrite(*out, &xout));
1796: PetscFunctionReturn(PETSC_SUCCESS);
1797: }
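/* Illustrative usage sketch (not part of the library source): permute a hypothetical vector `x`
   given in the PETSc ordering into the H2Opus ordering. The output vector is created by the call
   and owned by the caller.

     Vec xnat;
     PetscCall(MatH2OpusMapVec(H, PETSC_FALSE, x, &xnat)); // PETSc -> H2Opus ordering
     // ... work with xnat ...
     PetscCall(VecDestroy(&xnat));
*/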
1799: /*@C
1800: MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1802: Input Parameters:
1803: + A - the hierarchical `MATH2OPUS` matrix
1804: . U - the dense low-rank update matrix
1805: . V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1806: - s - the scaling factor
1808: Note:
1809: The `U` and `V` matrices must be dense (`MATDENSE` format)
1811: Level: intermediate
1813: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1814: @*/
1815: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1816: {
1817: PetscBool flg;
1819: PetscFunctionBegin;
1822: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1824: PetscCheckSameComm(A, 1, U, 2);
1825: if (V) {
1827: PetscCheckSameComm(A, 1, V, 3);
1828: }
1831: if (!V) V = U;
1832: PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1833: if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1834: PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1835: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1836: PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1837: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "The column layout of A must match the row layout of V");
1838: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1839: if (flg) {
1840: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1841: const PetscScalar *u, *v, *uu, *vv;
1842: PetscInt ldu, ldv;
1843: PetscMPIInt size;
1844: #if defined(H2OPUS_USE_MPI)
1845: h2opusHandle_t handle = a->handle->handle;
1846: #else
1847: h2opusHandle_t handle = a->handle;
1848: #endif
1849: PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1850: PetscSF usf, vsf;
1852: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1853: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1854: PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1855: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1856: PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1857: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1858: PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1859: PetscCall(MatDenseGetLDA(U, &ldu));
1860: PetscCall(MatDenseGetLDA(V, &ldv));
1861: PetscCall(MatBoundToCPU(A, &flg));
1862: if (usesf) {
1863: PetscInt n;
1865: PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
1866: PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
1867: PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1868: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1869: ldu = n;
1870: ldv = n;
1871: }
1872: if (flg) {
1873: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1874: PetscCall(MatDenseGetArrayRead(U, &u));
1875: PetscCall(MatDenseGetArrayRead(V, &v));
1876: if (usesf) {
1877: vv = MatH2OpusGetThrustPointer(*a->yy);
1878: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1879: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1880: if (U != V) {
1881: uu = MatH2OpusGetThrustPointer(*a->xx);
1882: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1883: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1884: } else uu = vv;
1885: } else {
1886: uu = u;
1887: vv = v;
1888: }
1889: hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1890: PetscCall(MatDenseRestoreArrayRead(U, &u));
1891: PetscCall(MatDenseRestoreArrayRead(V, &v));
1892: } else {
1893: #if defined(PETSC_H2OPUS_USE_GPU)
1894: PetscBool flgU, flgV;
1896: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1897: PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1898: if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1899: PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1900: if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1901: PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1902: PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1903: if (usesf) {
1904: vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1905: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1906: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1907: if (U != V) {
1908: uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1909: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1910: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1911: } else uu = vv;
1912: } else {
1913: uu = u;
1914: vv = v;
1915: }
1916: #else
1917: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1918: #endif
1919: hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1920: #if defined(PETSC_H2OPUS_USE_GPU)
1921: PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1922: PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1923: if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1924: if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1925: #endif
1926: }
1927: PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1928: a->orthogonal = PETSC_FALSE;
1929: }
1930: PetscFunctionReturn(PETSC_SUCCESS);
1931: }
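/* Illustrative usage sketch (not part of the library source): apply a symmetric rank-k downdate
   A <- A - U * U^T, with `U` a hypothetical dense matrix sharing the row layout of `A`
   (sequential only, as checked above). The update marks the matrix as non-orthogonal, so a
   subsequent compression may be desirable.

     PetscCall(MatH2OpusLowRankUpdate(A, U, NULL, -1.0)); // V = NULL means V = U
     PetscCall(MatH2OpusCompress(A, 1.e-6));              // hypothetical tolerance
*/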
1932: #endif