Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petscsf.h>
17: /* math2opusutils */
18: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
19: PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
20: PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
21: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
23: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
25: /* Use GPU only if H2OPUS is configured for GPU */
26: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
27: #define PETSC_H2OPUS_USE_GPU
28: #endif
29: #if defined(PETSC_H2OPUS_USE_GPU)
30: #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
31: #else
32: #define MatH2OpusUpdateIfNeeded(A,B) 0
33: #endif
35: // TODO H2OPUS:
36: // DistributedHMatrix
37: // unsymmetric ?
38: // transpose for distributed_hgemv?
39: // clearData()
40: // Unify interface for sequential and parallel?
41: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
42: //
43: template <class T> class PetscPointCloud : public H2OpusDataSet<T>
44: {
45: private:
46: int dimension;
47: size_t num_points;
48: std::vector<T> pts;
50: public:
51: PetscPointCloud(int dim, size_t num_pts, const T coords[])
52: {
53: this->dimension = dim;
54: this->num_points = num_pts;
56: pts.resize(num_pts*dim);
57: if (coords) {
58: for (size_t n = 0; n < num_points; n++)
59: for (int i = 0; i < dim; i++)
60: pts[n*dim + i] = coords[n*dim + i];
61: } else {
62: PetscReal h = 1./(num_points - 1);
63: for (size_t n = 0; n < num_points; n++)
64: for (int i = 0; i < dim; i++)
65: pts[n*dim + i] = i*h;
66: }
67: }
69: PetscPointCloud(const PetscPointCloud<T>& other)
70: {
71: size_t N = other.dimension * other.num_points;
72: this->dimension = other.dimension;
73: this->num_points = other.num_points;
74: this->pts.resize(N);
75: for (size_t i = 0; i < N; i++)
76: this->pts[i] = other.pts[i];
77: }
79: int getDimension() const
80: {
81: return dimension;
82: }
84: size_t getDataSetSize() const
85: {
86: return num_points;
87: }
89: T getDataPoint(size_t idx, int dim) const
90: {
91: assert(dim < dimension && idx < num_points);
92: return pts[idx*dimension + dim];
93: }
95: void Print(std::ostream& out = std::cout)
96: {
97: out << "Dimension: " << dimension << std::endl;
98: out << "NumPoints: " << num_points << std::endl;
99: for (size_t n = 0; n < num_points; n++) {
100: for (int d = 0; d < dimension; d++)
101: out << pts[n*dimension + d] << " ";
102: out << std::endl;
103: }
104: }
105: };
107: template<class T> class PetscFunctionGenerator
108: {
109: private:
110: MatH2OpusKernel k;
111: int dim;
112: void *ctx;
114: public:
115: PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
116: PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
117: T operator()(PetscReal *pt1, PetscReal *pt2)
118: {
119: return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
120: }
121: };
123: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
125: /* just to not clutter the code */
126: #if !defined(H2OPUS_USE_GPU)
127: typedef HMatrix HMatrix_GPU;
128: #if defined(H2OPUS_USE_MPI)
129: typedef DistributedHMatrix DistributedHMatrix_GPU;
130: #endif
131: #endif
133: typedef struct {
134: #if defined(H2OPUS_USE_MPI)
135: distributedH2OpusHandle_t handle;
136: #else
137: h2opusHandle_t handle;
138: #endif
139: /* Sequential and parallel matrices are two different classes at the moment */
140: HMatrix *hmatrix;
141: #if defined(H2OPUS_USE_MPI)
142: DistributedHMatrix *dist_hmatrix;
143: #else
144: HMatrix *dist_hmatrix; /* just to not clutter the code */
145: #endif
146: /* May use permutations */
147: PetscSF sf;
148: PetscLayout h2opus_rmap, h2opus_cmap;
149: IS h2opus_indexmap;
150: thrust::host_vector<PetscScalar> *xx,*yy;
151: PetscInt xxs,yys;
152: PetscBool multsetup;
154: /* GPU */
155: HMatrix_GPU *hmatrix_gpu;
156: #if defined(H2OPUS_USE_MPI)
157: DistributedHMatrix_GPU *dist_hmatrix_gpu;
158: #else
159: HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
160: #endif
161: #if defined(PETSC_H2OPUS_USE_GPU)
162: thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
163: PetscInt xxs_gpu,yys_gpu;
164: #endif
166: /* construction from matvecs */
167: PetscMatrixSampler* sampler;
168: PetscBool nativemult;
170: /* Admissibility */
171: PetscReal eta;
172: PetscInt leafsize;
174: /* for dof reordering */
175: PetscPointCloud<PetscReal> *ptcloud;
177: /* kernel for generating matrix entries */
178: PetscFunctionGenerator<PetscScalar> *kernel;
180: /* basis orthogonalized? */
181: PetscBool orthogonal;
183: /* customization */
184: PetscInt basisord;
185: PetscInt max_rank;
186: PetscInt bs;
187: PetscReal rtol;
188: PetscInt norm_max_samples;
189: PetscBool check_construction;
190: PetscBool hara_verbose;
192: /* keeps track of MatScale values */
193: PetscScalar s;
194: } Mat_H2OPUS;
196: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
197: {
198: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
202: #if defined(H2OPUS_USE_MPI)
203: h2opusDestroyDistributedHandle(a->handle);
204: #else
205: h2opusDestroyHandle(a->handle);
206: #endif
207: delete a->dist_hmatrix;
208: delete a->hmatrix;
209: PetscSFDestroy(&a->sf);
210: PetscLayoutDestroy(&a->h2opus_rmap);
211: PetscLayoutDestroy(&a->h2opus_cmap);
212: ISDestroy(&a->h2opus_indexmap);
213: delete a->xx;
214: delete a->yy;
215: delete a->hmatrix_gpu;
216: delete a->dist_hmatrix_gpu;
217: #if defined(PETSC_H2OPUS_USE_GPU)
218: delete a->xx_gpu;
219: delete a->yy_gpu;
220: #endif
221: delete a->sampler;
222: delete a->ptcloud;
223: delete a->kernel;
224: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL);
225: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL);
226: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL);
227: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL);
228: PetscObjectChangeTypeName((PetscObject)A,NULL);
229: PetscFree(A->data);
230: return(0);
231: }
233: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
234: {
235: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
236: PetscBool ish2opus;
242: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
243: if (ish2opus) {
244: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246: PetscLayout t;
247: t = A->rmap;
248: A->rmap = a->h2opus_rmap;
249: a->h2opus_rmap = t;
250: t = A->cmap;
251: A->cmap = a->h2opus_cmap;
252: a->h2opus_cmap = t;
253: }
254: }
255: a->nativemult = nm;
256: }
257: return(0);
258: }
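/* A minimal sketch of using the native (H2Opus-ordered) multiply, assuming A is an assembled
   MATH2OPUS and x is a user vector in the PETSc ordering; xnat, ynat and nm are illustrative names:

     Vec       xnat,ynat;
     PetscBool nm;

     MatH2OpusGetNativeMult(A,&nm);            // remember the current mode
     MatH2OpusMapVec(A,PETSC_FALSE,x,&xnat);   // creates xnat in the H2Opus ordering
     MatH2OpusSetNativeMult(A,PETSC_TRUE);     // swap layouts: vectors now follow the H2Opus distribution
     MatCreateVecs(A,NULL,&ynat);              // output vector in the native layout
     MatMult(A,xnat,ynat);                     // fast path, no PetscSF permutation applied
     MatH2OpusSetNativeMult(A,nm);             // restore the previous mode
     VecDestroy(&xnat);
     VecDestroy(&ynat);
*/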
260: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261: {
262: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
263: PetscBool ish2opus;
269: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
270: if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
271: *nm = a->nativemult;
272: return(0);
273: }
275: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
276: {
278: PetscBool ish2opus;
279: PetscInt nmax = PETSC_DECIDE;
280: Mat_H2OPUS *a = NULL;
281: PetscBool mult = PETSC_FALSE;
284: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
285:   if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
286: a = (Mat_H2OPUS*)A->data;
288: nmax = a->norm_max_samples;
289: mult = a->nativemult;
290: MatH2OpusSetNativeMult(A,PETSC_TRUE);
291: } else {
292: PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL);
293: }
294: MatApproximateNorm_Private(A,normtype,nmax,n);
295: if (a) { MatH2OpusSetNativeMult(A,mult); }
296: return(0);
297: }
299: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
300: {
301: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
302: #if defined(H2OPUS_USE_MPI)
303: h2opusHandle_t handle = h2opus->handle->handle;
304: #else
305: h2opusHandle_t handle = h2opus->handle;
306: #endif
307: PetscBool boundtocpu = PETSC_TRUE;
308: PetscScalar *xx,*yy,*uxx,*uyy;
309: PetscInt blda,clda;
310: PetscMPIInt size;
311: PetscSF bsf,csf;
312: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
316: HLibProfile::clear();
317: #if defined(PETSC_H2OPUS_USE_GPU)
318: boundtocpu = A->boundtocpu;
319: #endif
320: MatDenseGetLDA(B,&blda);
321: MatDenseGetLDA(C,&clda);
322: if (usesf) {
323: PetscInt n;
325: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
326: PetscObjectQuery((PetscObject)B,"_math2opus_vectorsf",(PetscObject*)&bsf);
327: if (!bsf) {
328: PetscSFGetVectorSF(h2opus->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);
329: PetscObjectCompose((PetscObject)B,"_math2opus_vectorsf",(PetscObject)bsf);
330: PetscObjectDereference((PetscObject)bsf);
331: }
332: PetscObjectQuery((PetscObject)C,"_math2opus_vectorsf",(PetscObject*)&csf);
333: if (!csf) {
334: PetscSFGetVectorSF(h2opus->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);
335: PetscObjectCompose((PetscObject)C,"_math2opus_vectorsf",(PetscObject)csf);
336: PetscObjectDereference((PetscObject)csf);
337: }
338: blda = n;
339: clda = n;
340: }
341: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
342: if (boundtocpu) {
343: if (usesf) {
344: PetscInt n;
346: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
347: if (h2opus->xxs < B->cmap->n) { h2opus->xx->resize(n*B->cmap->N); h2opus->xxs = B->cmap->N; }
348: if (h2opus->yys < B->cmap->n) { h2opus->yy->resize(n*B->cmap->N); h2opus->yys = B->cmap->N; }
349: }
350: MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
351: MatDenseGetArrayWrite(C,&yy);
352: if (usesf) {
353: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
354: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
355: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
356: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
357: } else {
358: uxx = xx;
359: uyy = yy;
360: }
361: if (size > 1) {
362: if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
363: if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
364: #if defined(H2OPUS_USE_MPI)
365: distributed_hgemv(/*transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
366: #endif
367: } else {
368: if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
369: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
370: }
371: MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
372: if (usesf) {
373: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
374: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
375: }
376: MatDenseRestoreArrayWrite(C,&yy);
377: #if defined(PETSC_H2OPUS_USE_GPU)
378: } else {
379: PetscBool ciscuda,biscuda;
381: if (usesf) {
382: PetscInt n;
384: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
385: if (h2opus->xxs_gpu < B->cmap->n) { h2opus->xx_gpu->resize(n*B->cmap->N); h2opus->xxs_gpu = B->cmap->N; }
386: if (h2opus->yys_gpu < B->cmap->n) { h2opus->yy_gpu->resize(n*B->cmap->N); h2opus->yys_gpu = B->cmap->N; }
387: }
388: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
389: PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
390: if (!biscuda) {
391: MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
392: }
393: PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
394: if (!ciscuda) {
395: C->assembled = PETSC_TRUE;
396: MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
397: }
398: MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
399: MatDenseCUDAGetArrayWrite(C,&yy);
400: if (usesf) {
401: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
402: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
403: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
404: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
405: } else {
406: uxx = xx;
407: uyy = yy;
408: }
409: PetscLogGpuTimeBegin();
410: if (size > 1) {
411: if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
412: if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
413: #if defined(H2OPUS_USE_MPI)
414: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
415: #endif
416: } else {
417: if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
418: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
419: }
420: PetscLogGpuTimeEnd();
421: MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
422: if (usesf) {
423: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
424: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
425: }
426: MatDenseCUDARestoreArrayWrite(C,&yy);
427: if (!biscuda) {
428: MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
429: }
430: if (!ciscuda) {
431: MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
432: }
433: #endif
434: }
435: { /* log flops */
436: double gops,time,perf,dev;
437: HLibProfile::getHgemvPerf(gops,time,perf,dev);
438: #if defined(PETSC_H2OPUS_USE_GPU)
439: if (boundtocpu) {
440: PetscLogFlops(1e9*gops);
441: } else {
442: PetscLogGpuFlops(1e9*gops);
443: }
444: #else
445: PetscLogFlops(1e9*gops);
446: #endif
447: }
448: return(0);
449: }
451: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
452: {
453: Mat_Product *product = C->product;
457: MatCheckProduct(C,1);
458: switch (product->type) {
459: case MATPRODUCT_AB:
460: MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C);
461: break;
462: case MATPRODUCT_AtB:
463: MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C);
464: break;
465: default:
466: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
467: }
468: return(0);
469: }
471: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
472: {
474: Mat_Product *product = C->product;
475: PetscBool cisdense;
476: Mat A,B;
479: MatCheckProduct(C,1);
480: A = product->A;
481: B = product->B;
482: switch (product->type) {
483: case MATPRODUCT_AB:
484: MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
485: MatSetBlockSizesFromMats(C,product->A,product->B);
486: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
487: if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
488: MatSetUp(C);
489: break;
490: case MATPRODUCT_AtB:
491: MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
492: MatSetBlockSizesFromMats(C,product->A,product->B);
493: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
494: if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
495: MatSetUp(C);
496: break;
497: default:
498: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
499: }
500: C->ops->productsymbolic = NULL;
501: C->ops->productnumeric = MatProductNumeric_H2OPUS;
502: return(0);
503: }
505: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
506: {
508: MatCheckProduct(C,1);
509: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
510: C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
511: }
512: return(0);
513: }
515: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
516: {
517: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
518: #if defined(H2OPUS_USE_MPI)
519: h2opusHandle_t handle = h2opus->handle->handle;
520: #else
521: h2opusHandle_t handle = h2opus->handle;
522: #endif
523: PetscBool boundtocpu = PETSC_TRUE;
524: PetscInt n;
525: PetscScalar *xx,*yy,*uxx,*uyy;
526: PetscMPIInt size;
527: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
531: HLibProfile::clear();
532: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
533: #if defined(PETSC_H2OPUS_USE_GPU)
534: boundtocpu = A->boundtocpu;
535: #endif
536: if (usesf) {
537: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
538: } else n = A->rmap->n;
539: if (boundtocpu) {
540: VecGetArrayRead(x,(const PetscScalar**)&xx);
541: if (sy == 0.0) {
542: VecGetArrayWrite(y,&yy);
543: } else {
544: VecGetArray(y,&yy);
545: }
546: if (usesf) {
547: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
550: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
551: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
552: if (sy != 0.0) {
553: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
554: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
555: }
556: } else {
557: uxx = xx;
558: uyy = yy;
559: }
560: if (size > 1) {
561: if (!h2opus->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
562: if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
563: #if defined(H2OPUS_USE_MPI)
564: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565: #endif
566: } else {
567: if (!h2opus->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
568: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569: }
570: VecRestoreArrayRead(x,(const PetscScalar**)&xx);
571: if (usesf) {
572: PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
573: PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
574: }
575: if (sy == 0.0) {
576: VecRestoreArrayWrite(y,&yy);
577: } else {
578: VecRestoreArray(y,&yy);
579: }
580: #if defined(PETSC_H2OPUS_USE_GPU)
581: } else {
582: VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
583: if (sy == 0.0) {
584: VecCUDAGetArrayWrite(y,&yy);
585: } else {
586: VecCUDAGetArray(y,&yy);
587: }
588: if (usesf) {
589: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
592: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
593: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
594: if (sy != 0.0) {
595: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
596: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
597: }
598: } else {
599: uxx = xx;
600: uyy = yy;
601: }
602: PetscLogGpuTimeBegin();
603: if (size > 1) {
604: if (!h2opus->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
605: if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
606: #if defined(H2OPUS_USE_MPI)
607: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608: #endif
609: } else {
610: if (!h2opus->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
611: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612: }
613: PetscLogGpuTimeEnd();
614: VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
615: if (usesf) {
616: PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
617: PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
618: }
619: if (sy == 0.0) {
620: VecCUDARestoreArrayWrite(y,&yy);
621: } else {
622: VecCUDARestoreArray(y,&yy);
623: }
624: #endif
625: }
626: { /* log flops */
627: double gops,time,perf,dev;
628: HLibProfile::getHgemvPerf(gops,time,perf,dev);
629: #if defined(PETSC_H2OPUS_USE_GPU)
630: if (boundtocpu) {
631: PetscLogFlops(1e9*gops);
632: } else {
633: PetscLogGpuFlops(1e9*gops);
634: }
635: #else
636: PetscLogFlops(1e9*gops);
637: #endif
638: }
639: return(0);
640: }
642: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643: {
644: PetscBool xiscuda,yiscuda;
648: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
649: PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
650: MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
651: MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE);
652: return(0);
653: }
655: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656: {
657: PetscBool xiscuda,yiscuda;
661: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
662: PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
663: MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
664: MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE);
665: return(0);
666: }
668: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
669: {
670: PetscBool xiscuda,ziscuda;
674: VecCopy(y,z);
675: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
676: PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
677: MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
678: MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE);
679: return(0);
680: }
682: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
683: {
684: PetscBool xiscuda,ziscuda;
688: VecCopy(y,z);
689: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
690: PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
691: MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
692: MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE);
693: return(0);
694: }
696: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
697: {
698: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
701: a->s *= s;
702: return(0);
703: }
705: static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
706: {
707: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
711: PetscOptionsHead(PetscOptionsObject,"H2OPUS options");
712: PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL);
713: PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
714: PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
715: PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
716: PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
717: PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL);
718: PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
719: PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
720: PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL);
721: PetscOptionsTail();
722: return(0);
723: }
725: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
727: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
728: {
729: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
730: Vec c;
731: PetscInt spacedim;
732: const PetscScalar *coords;
733: PetscErrorCode ierr;
736: if (a->ptcloud) return(0);
737: PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c);
738: if (!c && a->sampler) {
739: Mat S = a->sampler->GetSamplingMat();
741: PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c);
742: }
743: if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
744: VecGetArrayRead(c,&coords);
745: VecGetBlockSize(c,&spacedim);
746: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL);
747: VecRestoreArrayRead(c,&coords);
748: return(0);
749: }
751: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
752: {
753: MPI_Comm comm;
754: PetscMPIInt size;
756: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
757: PetscInt n = 0,*idx = NULL;
758: int *iidx = NULL;
759: PetscCopyMode own;
760: PetscBool rid;
763: if (a->multsetup) return(0);
764: if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
765: PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
766: #if defined(PETSC_H2OPUS_USE_GPU)
767: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
768: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
769: a->xxs_gpu = 1;
770: a->yys_gpu = 1;
771: #endif
772: a->xx = new thrust::host_vector<PetscScalar>(n);
773: a->yy = new thrust::host_vector<PetscScalar>(n);
774: a->xxs = 1;
775: a->yys = 1;
776: } else {
777: IS is;
778: PetscObjectGetComm((PetscObject)A,&comm);
779: MPI_Comm_size(comm,&size);
780: if (!a->h2opus_indexmap) {
781: if (size > 1) {
782: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
783: #if defined(H2OPUS_USE_MPI)
784: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
785: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
786: #endif
787: } else {
788: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
789: n = a->hmatrix->u_basis_tree.index_map.size();
790: }
792: if (PetscDefined(USE_64BIT_INDICES)) {
793: PetscInt i;
795: own = PETSC_OWN_POINTER;
796: PetscMalloc1(n,&idx);
797: for (i=0;i<n;i++) idx[i] = iidx[i];
798: } else {
799: own = PETSC_COPY_VALUES;
800: idx = (PetscInt*)iidx;
801: }
802: ISCreateGeneral(comm,n,idx,own,&is);
803: ISSetPermutation(is);
804: ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view");
805: a->h2opus_indexmap = is;
806: }
807: ISGetLocalSize(a->h2opus_indexmap,&n);
808: ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
809: rid = (PetscBool)(n == A->rmap->n);
810: MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm);
811: if (rid) {
812: ISIdentity(a->h2opus_indexmap,&rid);
813: }
814: if (!rid) {
815: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
816: PetscLayoutCreate(comm,&a->h2opus_rmap);
817: PetscLayoutSetLocalSize(a->h2opus_rmap,n);
818: PetscLayoutSetUp(a->h2opus_rmap);
819: PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap);
820: }
821: PetscSFCreate(comm,&a->sf);
822: PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
823: PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view");
824: #if defined(PETSC_H2OPUS_USE_GPU)
825: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
826: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
827: a->xxs_gpu = 1;
828: a->yys_gpu = 1;
829: #endif
830: a->xx = new thrust::host_vector<PetscScalar>(n);
831: a->yy = new thrust::host_vector<PetscScalar>(n);
832: a->xxs = 1;
833: a->yys = 1;
834: }
835: ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
836: }
837: a->multsetup = PETSC_TRUE;
838: return(0);
839: }
841: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
842: {
843: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
844: #if defined(H2OPUS_USE_MPI)
845: h2opusHandle_t handle = a->handle->handle;
846: #else
847: h2opusHandle_t handle = a->handle;
848: #endif
849: PetscBool kernel = PETSC_FALSE;
850: PetscBool boundtocpu = PETSC_TRUE;
851: PetscBool samplingdone = PETSC_FALSE;
852: MPI_Comm comm;
853: PetscMPIInt size;
857: PetscObjectGetComm((PetscObject)A,&comm);
858: if (A->rmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
859: if (A->rmap->N != A->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
860: MPI_Comm_size(comm,&size);
861: /* TODO REUSABILITY of geometric construction */
862: delete a->hmatrix;
863: delete a->dist_hmatrix;
864: #if defined(PETSC_H2OPUS_USE_GPU)
865: delete a->hmatrix_gpu;
866: delete a->dist_hmatrix_gpu;
867: #endif
868: a->orthogonal = PETSC_FALSE;
870: /* TODO: other? */
871: H2OpusBoxCenterAdmissibility adm(a->eta);
873: PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0);
874: if (size > 1) {
875: #if defined(H2OPUS_USE_MPI)
876: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
877: #else
878: a->dist_hmatrix = NULL;
879: #endif
880: } else {
881: a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
882: }
883: MatH2OpusInferCoordinates_Private(A);
884: if (!a->ptcloud) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
885: if (a->kernel) {
886: BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
887: if (size > 1) {
888: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
889: #if defined(H2OPUS_USE_MPI)
890: buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
891: #endif
892: } else {
893: buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
894: }
895: kernel = PETSC_TRUE;
896: } else {
897: if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
898: buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
899: }
900: MatSetUpMultiply_H2OPUS(A);
902: #if defined(PETSC_H2OPUS_USE_GPU)
903: boundtocpu = A->boundtocpu;
904: if (!boundtocpu) {
905: if (size > 1) {
906: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
907: #if defined(H2OPUS_USE_MPI)
908: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
909: #endif
910: } else {
911: a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
912: }
913: }
914: #endif
915: if (size == 1) {
916: if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
917: PetscReal Anorm;
918: bool verbose;
920: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL);
921: verbose = a->hara_verbose;
922: MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm);
923: if (a->hara_verbose) { PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs); }
924: if (a->sf && !a->nativemult) {
925: a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
926: }
927: a->sampler->SetStream(handle->getMainStream());
928: if (boundtocpu) {
929: a->sampler->SetGPUSampling(false);
930: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
931: #if defined(PETSC_H2OPUS_USE_GPU)
932: } else {
933: a->sampler->SetGPUSampling(true);
934: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
935: #endif
936: }
937: samplingdone = PETSC_TRUE;
938: }
939: }
940: #if defined(PETSC_H2OPUS_USE_GPU)
941: if (!boundtocpu) {
942: delete a->hmatrix;
943: delete a->dist_hmatrix;
944: a->hmatrix = NULL;
945: a->dist_hmatrix = NULL;
946: }
947: A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
948: #endif
949: PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0);
951: if (!a->s) a->s = 1.0;
952: A->assembled = PETSC_TRUE;
954: if (samplingdone) {
955: PetscBool check = a->check_construction;
956: PetscBool checke = PETSC_FALSE;
958: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL);
959: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL);
960: if (check) {
961: Mat E,Ae;
962: PetscReal n1,ni,n2;
963: PetscReal n1A,niA,n2A;
964: void (*normfunc)(void);
966: Ae = a->sampler->GetSamplingMat();
967: MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
968: MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
969: MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
970: MatNorm(E,NORM_1,&n1);
971: MatNorm(E,NORM_INFINITY,&ni);
972: MatNorm(E,NORM_2,&n2);
973: if (checke) {
974: Mat eA,eE,eAe;
976: MatComputeOperator(A,MATAIJ,&eA);
977: MatComputeOperator(E,MATAIJ,&eE);
978: MatComputeOperator(Ae,MATAIJ,&eAe);
979: MatChop(eA,PETSC_SMALL);
980: MatChop(eE,PETSC_SMALL);
981: MatChop(eAe,PETSC_SMALL);
982: PetscObjectSetName((PetscObject)eA,"H2Mat");
983: MatView(eA,NULL);
984: PetscObjectSetName((PetscObject)eAe,"S");
985: MatView(eAe,NULL);
986: PetscObjectSetName((PetscObject)eE,"H2Mat - S");
987: MatView(eE,NULL);
988: MatDestroy(&eA);
989: MatDestroy(&eE);
990: MatDestroy(&eAe);
991: }
993: MatGetOperation(Ae,MATOP_NORM,&normfunc);
994: MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
995: MatNorm(Ae,NORM_1,&n1A);
996: MatNorm(Ae,NORM_INFINITY,&niA);
997: MatNorm(Ae,NORM_2,&n2A);
998: n1A = PetscMax(n1A,PETSC_SMALL);
999: n2A = PetscMax(n2A,PETSC_SMALL);
1000: niA = PetscMax(niA,PETSC_SMALL);
1001: MatSetOperation(Ae,MATOP_NORM,normfunc);
1002: PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
1003: MatDestroy(&E);
1004: }
1005: }
1006: return(0);
1007: }
1009: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1010: {
1012: PetscMPIInt size;
1013: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1016: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1017: if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1018: else {
1019: a->hmatrix->clearData();
1020: #if defined(PETSC_H2OPUS_USE_GPU)
1021: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1022: #endif
1023: }
1024: return(0);
1025: }
1027: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1028: {
1029: Mat A;
1030: Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data;
1031: #if defined(PETSC_H2OPUS_USE_GPU)
1032: PetscBool iscpu = PETSC_FALSE;
1033: #else
1034: PetscBool iscpu = PETSC_TRUE;
1035: #endif
1037: MPI_Comm comm;
1040: PetscObjectGetComm((PetscObject)B,&comm);
1041: MatCreate(comm,&A);
1042: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1043: MatSetType(A,MATH2OPUS);
1044: MatPropagateSymmetryOptions(B,A);
1045: a = (Mat_H2OPUS*)A->data;
1047: a->eta = b->eta;
1048: a->leafsize = b->leafsize;
1049: a->basisord = b->basisord;
1050: a->max_rank = b->max_rank;
1051: a->bs = b->bs;
1052: a->rtol = b->rtol;
1053: a->norm_max_samples = b->norm_max_samples;
1054: if (op == MAT_COPY_VALUES) a->s = b->s;
1056: a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1057: if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1059: #if defined(H2OPUS_USE_MPI)
1060: if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1061: #if defined(PETSC_H2OPUS_USE_GPU)
1062: if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1063: #endif
1064: #endif
1065: if (b->hmatrix) {
1066: a->hmatrix = new HMatrix(*b->hmatrix);
1067: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1068: }
1069: #if defined(PETSC_H2OPUS_USE_GPU)
1070: if (b->hmatrix_gpu) {
1071: a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1072: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1073: }
1074: #endif
1075: if (b->sf) {
1076: PetscObjectReference((PetscObject)b->sf);
1077: a->sf = b->sf;
1078: }
1079: if (b->h2opus_indexmap) {
1080: PetscObjectReference((PetscObject)b->h2opus_indexmap);
1081: a->h2opus_indexmap = b->h2opus_indexmap;
1082: }
1084: MatSetUp(A);
1085: MatSetUpMultiply_H2OPUS(A);
1086: if (op == MAT_COPY_VALUES) {
1087: A->assembled = PETSC_TRUE;
1088: a->orthogonal = b->orthogonal;
1089: #if defined(PETSC_H2OPUS_USE_GPU)
1090: A->offloadmask = B->offloadmask;
1091: #endif
1092: }
1093: #if defined(PETSC_H2OPUS_USE_GPU)
1094: iscpu = B->boundtocpu;
1095: #endif
1096: MatBindToCPU(A,iscpu);
1098: *nA = A;
1099: return(0);
1100: }
1102: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1103: {
1104: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
1105: PetscBool isascii;
1106: PetscErrorCode ierr;
1107: PetscMPIInt size;
1108: PetscViewerFormat format;
1111: PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1112: PetscViewerGetFormat(view,&format);
1113: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1114: if (isascii) {
1115: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1116: if (size == 1) {
1117: FILE *fp;
1118: PetscViewerASCIIGetPointer(view,&fp);
1119: dumpHMatrix(*h2opus->hmatrix,6,fp);
1120: }
1121: } else {
1122: PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat");
1123: PetscViewerASCIIPrintf(view," PointCloud dim %D\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1124: PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %D, eta %g\n",h2opus->leafsize,(double)h2opus->eta);
1125: if (!h2opus->kernel) {
1126: PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %D, samples %D, tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol);
1127: } else {
1128: PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %D\n",h2opus->basisord);
1129: }
1130: PetscViewerASCIIPrintf(view," Number of samples for norms %D\n",h2opus->norm_max_samples);
1131: if (size == 1) {
1132: double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1133: double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1134: #if defined(PETSC_HAVE_CUDA)
1135: double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1136: double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1137: #endif
1138: PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1139: #if defined(PETSC_HAVE_CUDA)
1140: PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1141: #endif
1142: } else {
1143: #if defined(PETSC_HAVE_CUDA)
1144: double matrix_mem[4] = {0.,0.,0.,0.};
1145: PetscMPIInt rsize = 4;
1146: #else
1147: double matrix_mem[2] = {0.,0.};
1148: PetscMPIInt rsize = 2;
1149: #endif
1150: #if defined(H2OPUS_USE_MPI)
1151: matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1152: matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1153: #if defined(PETSC_HAVE_CUDA)
1154: matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1155: matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1156: #endif
1157: #endif
1158: MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A));
1159: PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1160: #if defined(PETSC_HAVE_CUDA)
1161: PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1162: #endif
1163: }
1164: }
1165: }
1166: #if 0
1167: if (size == 1) {
1168: char filename[256];
1169: const char *name;
1171: PetscObjectGetName((PetscObject)A,&name);
1172: PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name);
1173: outputEps(*h2opus->hmatrix,filename);
1174: }
1175: #endif
1176: return(0);
1177: }
1179: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1180: {
1181: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
1182: PetscReal *gcoords;
1183: PetscInt N;
1184: MPI_Comm comm;
1185: PetscMPIInt size;
1186: PetscBool cong;
1190: PetscLayoutSetUp(A->rmap);
1191: PetscLayoutSetUp(A->cmap);
1192: PetscObjectGetComm((PetscObject)A,&comm);
1193: MatHasCongruentLayouts(A,&cong);
1194: if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1195: N = A->rmap->N;
1196: MPI_Comm_size(comm,&size);
1197: if (size > 1 && cdist) {
1198: PetscSF sf;
1199: MPI_Datatype dtype;
1201: MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1202: MPI_Type_commit(&dtype);
1204: PetscSFCreate(comm,&sf);
1205: PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1206: PetscMalloc1(spacedim*N,&gcoords);
1207: PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1208: PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1209: PetscSFDestroy(&sf);
1210: MPI_Type_free(&dtype);
1211: } else gcoords = (PetscReal*)coords;
1213: delete h2opus->ptcloud;
1214: delete h2opus->kernel;
1215: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1216: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1217: if (gcoords != coords) { PetscFree(gcoords); }
1218: A->preallocated = PETSC_TRUE;
1219: return(0);
1220: }
1222: #if defined(PETSC_H2OPUS_USE_GPU)
1223: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1224: {
1225: PetscMPIInt size;
1226: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1230: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1231: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1232: if (size > 1) {
1233: if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1234: #if defined(H2OPUS_USE_MPI)
1235: if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1236: else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1237: #endif
1238: } else {
1239: if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1240: if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1241: else *a->hmatrix = *a->hmatrix_gpu;
1242: }
1243: delete a->hmatrix_gpu;
1244: delete a->dist_hmatrix_gpu;
1245: a->hmatrix_gpu = NULL;
1246: a->dist_hmatrix_gpu = NULL;
1247: A->offloadmask = PETSC_OFFLOAD_CPU;
1248: } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1249: if (size > 1) {
1250: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1251: #if defined(H2OPUS_USE_MPI)
1252: if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1253: else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1254: #endif
1255: } else {
1256: if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1257: if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1258: else *a->hmatrix_gpu = *a->hmatrix;
1259: }
1260: delete a->hmatrix;
1261: delete a->dist_hmatrix;
1262: a->hmatrix = NULL;
1263: a->dist_hmatrix = NULL;
1264: A->offloadmask = PETSC_OFFLOAD_GPU;
1265: }
1266: PetscFree(A->defaultvectype);
1267: if (!flg) {
1268: PetscStrallocpy(VECCUDA,&A->defaultvectype);
1269: } else {
1270: PetscStrallocpy(VECSTANDARD,&A->defaultvectype);
1271: }
1272: A->boundtocpu = flg;
1273: return(0);
1274: }
1275: #endif
1277: /*MC
1278: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1280: Options Database Keys:
1281: . -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1283: Notes:
1284: H2Opus implements hierarchical matrices in the H^2 flavour.
1285: It supports CPU or NVIDIA GPUs.
1286:      For CPU-only builds, configure PETSc with ./configure --download-h2opus --download-thrust.
1287:      To run on NVIDIA GPUs, configure with ./configure --download-h2opus --download-magma --download-kblas.
1288: For details and additional references, see
1289: "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1290: available at https://arxiv.org/abs/2109.05451.
1292: Level: beginner
1294: .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1295: M*/
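/* The runtime options below are registered in MatSetFromOptions_H2OPUS() above; a typical
   (illustrative) command line for an application using this type could be:

     ./app -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 \
           -mat_h2opus_order 6 -mat_h2opus_rtol 1e-6 -mat_h2opus_check
*/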
1296: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1297: {
1298: Mat_H2OPUS *a;
1300: PetscMPIInt size;
1303: #if defined(PETSC_H2OPUS_USE_GPU)
1304: PetscCUDAInitializeCheck();
1305: #endif
1306: PetscNewLog(A,&a);
1307: A->data = (void*)a;
1309: a->eta = 0.9;
1310: a->leafsize = 32;
1311: a->basisord = 4;
1312: a->max_rank = 64;
1313: a->bs = 32;
1314: a->rtol = 1.e-4;
1315: a->s = 1.0;
1316: a->norm_max_samples = 10;
1317: #if defined(H2OPUS_USE_MPI)
1318: h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1319: #else
1320: h2opusCreateHandle(&a->handle);
1321: #endif
1322: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1323: PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS);
1324: PetscMemzero(A->ops,sizeof(struct _MatOps));
1326: A->ops->destroy = MatDestroy_H2OPUS;
1327: A->ops->view = MatView_H2OPUS;
1328: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1329: A->ops->mult = MatMult_H2OPUS;
1330: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1331: A->ops->multadd = MatMultAdd_H2OPUS;
1332: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1333: A->ops->scale = MatScale_H2OPUS;
1334: A->ops->duplicate = MatDuplicate_H2OPUS;
1335: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1336: A->ops->norm = MatNorm_H2OPUS;
1337: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1338: #if defined(PETSC_H2OPUS_USE_GPU)
1339: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1340: #endif
1342: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS);
1343: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS);
1344: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS);
1345: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS);
1346: #if defined(PETSC_H2OPUS_USE_GPU)
1347: PetscFree(A->defaultvectype);
1348: PetscStrallocpy(VECCUDA,&A->defaultvectype);
1349: #endif
1350: return(0);
1351: }
1353: /*@C
1354: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1356: Input Parameter:
1357: . A - the matrix
1359: Level: intermediate
1361: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1362: @*/
1363: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1364: {
1366: PetscBool ish2opus;
1367: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1368: PetscMPIInt size;
1369: PetscBool boundtocpu = PETSC_TRUE;
1374: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1375: if (!ish2opus) return(0);
1376: if (a->orthogonal) return(0);
1377: HLibProfile::clear();
1378: PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0);
1379: #if defined(PETSC_H2OPUS_USE_GPU)
1380: boundtocpu = A->boundtocpu;
1381: #endif
1382: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1383: if (size > 1) {
1384: if (boundtocpu) {
1385: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1386: #if defined(H2OPUS_USE_MPI)
1387: distributed_horthog(*a->dist_hmatrix, a->handle);
1388: #endif
1389: #if defined(PETSC_H2OPUS_USE_GPU)
1390: A->offloadmask = PETSC_OFFLOAD_CPU;
1391: } else {
1392: if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1393: PetscLogGpuTimeBegin();
1394: #if defined(H2OPUS_USE_MPI)
1395: distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1396: #endif
1397: PetscLogGpuTimeEnd();
1398: #endif
1399: }
1400: } else {
1401: #if defined(H2OPUS_USE_MPI)
1402: h2opusHandle_t handle = a->handle->handle;
1403: #else
1404: h2opusHandle_t handle = a->handle;
1405: #endif
1406: if (boundtocpu) {
1407: if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1408: horthog(*a->hmatrix, handle);
1409: #if defined(PETSC_H2OPUS_USE_GPU)
1410: A->offloadmask = PETSC_OFFLOAD_CPU;
1411: } else {
1412: if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1413: PetscLogGpuTimeBegin();
1414: horthog(*a->hmatrix_gpu, handle);
1415: PetscLogGpuTimeEnd();
1416: #endif
1417: }
1418: }
1419: a->orthogonal = PETSC_TRUE;
1420: { /* log flops */
1421: double gops,time,perf,dev;
1422: HLibProfile::getHorthogPerf(gops,time,perf,dev);
1423: #if defined(PETSC_H2OPUS_USE_GPU)
1424: if (boundtocpu) {
1425: PetscLogFlops(1e9*gops);
1426: } else {
1427: PetscLogGpuFlops(1e9*gops);
1428: }
1429: #else
1430: PetscLogFlops(1e9*gops);
1431: #endif
1432: }
1433: PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0);
1434: return(0);
1435: }
1437: /*@C
1438: MatH2OpusCompress - Compress a hierarchical matrix.
1440: Input Parameters:
1441: + A - the matrix
1442: - tol - the absolute truncation threshold
1444: Level: intermediate
1446: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1447: @*/
1448: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1449: {
1451: PetscBool ish2opus;
1452: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1453: PetscMPIInt size;
1454: PetscBool boundtocpu = PETSC_TRUE;
1459: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1460: if (!ish2opus) return(0);
1461: MatH2OpusOrthogonalize(A);
1462: HLibProfile::clear();
1463: PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0);
1464: #if defined(PETSC_H2OPUS_USE_GPU)
1465: boundtocpu = A->boundtocpu;
1466: #endif
1467: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1468: if (size > 1) {
1469: if (boundtocpu) {
1470: if (!a->dist_hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1471: #if defined(H2OPUS_USE_MPI)
1472: distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1473: #endif
1474: #if defined(PETSC_H2OPUS_USE_GPU)
1475: A->offloadmask = PETSC_OFFLOAD_CPU;
1476: } else {
1477: if (!a->dist_hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1478: PetscLogGpuTimeBegin();
1479: #if defined(H2OPUS_USE_MPI)
1480: distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1481: #endif
1482: PetscLogGpuTimeEnd();
1483: #endif
1484: }
1485: } else {
1486: #if defined(H2OPUS_USE_MPI)
1487: h2opusHandle_t handle = a->handle->handle;
1488: #else
1489: h2opusHandle_t handle = a->handle;
1490: #endif
1491: if (boundtocpu) {
1492: if (!a->hmatrix) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1493: hcompress(*a->hmatrix, tol, handle);
1494: #if defined(PETSC_H2OPUS_USE_GPU)
1495: A->offloadmask = PETSC_OFFLOAD_CPU;
1496: } else {
1497: if (!a->hmatrix_gpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1498: PetscLogGpuTimeBegin();
1499: hcompress(*a->hmatrix_gpu, tol, handle);
1500: PetscLogGpuTimeEnd();
1501: #endif
1502: }
1503: }
1504: { /* log flops */
1505: double gops,time,perf,dev;
1506: HLibProfile::getHcompressPerf(gops,time,perf,dev);
1507: #if defined(PETSC_H2OPUS_USE_GPU)
1508: if (boundtocpu) {
1509: PetscLogFlops(1e9*gops);
1510: } else {
1511: PetscLogGpuFlops(1e9*gops);
1512: }
1513: #else
1514: PetscLogFlops(1e9*gops);
1515: #endif
1516: }
1517: PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0);
1518: return(0);
1519: }
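/* A minimal sketch of compressing an assembled MATH2OPUS with a threshold relative to the
   operator norm; A and the relative tolerance 1.e-6 are illustrative, and MatNorm() here uses
   the sampling-based MatNorm_H2OPUS() defined above:

     PetscReal Anorm;

     MatNorm(A,NORM_2,&Anorm);
     MatH2OpusCompress(A,1.e-6*Anorm);   // absolute threshold; the basis is orthogonalized internally
*/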
1521: /*@C
1522:      MatH2OpusSetSamplingMat - Set the matrix to be sampled via matrix-vector products when constructing the hierarchical matrix.
1524: Input Parameters:
1525: + A - the hierarchical matrix
1526: . B - the matrix to be sampled
1527: . bs - maximum number of samples to be taken concurrently
1528: - tol - relative tolerance for construction
1530: Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1532: Level: intermediate
1534: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1535: @*/
1536: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1537: {
1538: PetscBool ish2opus;
1545: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1546: if (ish2opus) {
1547: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1549: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1550: a->sampler->SetSamplingMat(B);
1551: if (bs > 0) a->bs = bs;
1552: if (tol > 0.) a->rtol = tol;
1553: delete a->kernel;
1554: }
1555: return(0);
1556: }
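/* A minimal sketch of resetting the sampled operator and rebuilding, assuming A was created
   with MatCreateH2OpusFromMat() and Bnew is a new operator with the same layouts; the bs and
   rtol values are illustrative:

     MatH2OpusSetSamplingMat(A,Bnew,32,1.e-4);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);   // the matrix-free construction reruns in MatAssemblyEnd_H2OPUS()
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/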
1558: /*@C
1559: MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1561: Input Parameters:
1562: + comm - MPI communicator
1563: . m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1564: . n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1565: . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1566: . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1567: . spacedim - dimension of the space coordinates
1568: . coords - coordinates of the points
1569: . cdist - whether or not coordinates are distributed
1570: . kernel - computational kernel (or NULL)
1571: . kernelctx - kernel context
1572: . eta - admissibility condition tolerance
1573: . leafsize - leaf size in cluster tree
1574: -   basisord - approximation order for Chebyshev interpolation of low-rank blocks
1576: Output Parameter:
1577: . nA - matrix
1579: Options Database Keys:
1580: + -mat_h2opus_leafsize <PetscInt>
1581: . -mat_h2opus_eta <PetscReal>
1582: .   -mat_h2opus_order <PetscInt> - Chebyshev approximation order
1583: -   -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1585: Level: intermediate
1587: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1588: @*/
1589: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1590: {
1591: Mat A;
1592: Mat_H2OPUS *h2opus;
1593: #if defined(PETSC_H2OPUS_USE_GPU)
1594: PetscBool iscpu = PETSC_FALSE;
1595: #else
1596: PetscBool iscpu = PETSC_TRUE;
1597: #endif
1601: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1602: MatCreate(comm,&A);
1603: MatSetSizes(A,m,n,M,N);
1604: if (M != N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1605: MatSetType(A,MATH2OPUS);
1606: MatBindToCPU(A,iscpu);
1607: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx);
1609: h2opus = (Mat_H2OPUS*)A->data;
1610: if (eta > 0.) h2opus->eta = eta;
1611: if (leafsize > 0) h2opus->leafsize = leafsize;
1612: if (basisord > 0) h2opus->basisord = basisord;
1614: *nA = A;
1615: return(0);
1616: }
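/* A minimal sketch of construction from a kernel, assuming each process owns n points with
   2D coordinates stored interleaved in coords[]; the Gaussian kernel and all parameter values
   are illustrative:

     static PetscScalar GaussKernel(PetscInt dim, PetscReal x[], PetscReal y[], void *ctx)
     {
       PetscReal d2 = 0.0;
       for (PetscInt d = 0; d < dim; d++) d2 += (x[d] - y[d])*(x[d] - y[d]);
       return PetscExpReal(-d2);
     }

     Mat A;
     MatCreateH2OpusFromKernel(PETSC_COMM_WORLD,n,n,PETSC_DETERMINE,PETSC_DETERMINE,2,coords,PETSC_TRUE,GaussKernel,NULL,0.9,32,4,&A);
     MatSetFromOptions(A);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);   // the hierarchical matrix is built in MatAssemblyEnd_H2OPUS()
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/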
1618: /*@C
1619:      MatCreateH2OpusFromMat - Creates a MATH2OPUS matrix by sampling a user-supplied operator.
1621: Input Parameters:
1622: + B - the matrix to be sampled
1623: . spacedim - dimension of the space coordinates
1624: . coords - coordinates of the points
1625: . cdist - whether or not coordinates are distributed
1626: . eta - admissibility condition tolerance
1627: . leafsize - leaf size in cluster tree
1628: . maxrank - maximum rank allowed
1629: . bs - maximum number of samples to be taken concurrently
1630: - rtol - relative tolerance for construction
1632: Output Parameter:
1633: . nA - matrix
1635: Options Database Keys:
1636: + -mat_h2opus_leafsize <PetscInt>
1637: . -mat_h2opus_eta <PetscReal>
1638: . -mat_h2opus_maxrank <PetscInt>
1639: . -mat_h2opus_samples <PetscInt>
1640: . -mat_h2opus_rtol <PetscReal>
1641: . -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1642: . -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1643: -   -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1645: Notes: not available in parallel
1647: Level: intermediate
1649: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1650: @*/
1651: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1652: {
1653: Mat A;
1654: Mat_H2OPUS *h2opus;
1655: MPI_Comm comm;
1656: PetscBool boundtocpu = PETSC_TRUE;
1668: PetscObjectGetComm((PetscObject)B,&comm);
1669: if (B->rmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1670: if (B->rmap->N != B->cmap->N) SETERRQ(comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1671: MatCreate(comm,&A);
1672: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1673: #if defined(PETSC_H2OPUS_USE_GPU)
1674: {
1675: PetscBool iscuda;
1676: VecType vtype;
1678: MatGetVecType(B,&vtype);
1679: PetscStrcmp(vtype,VECCUDA,&iscuda);
1680: if (!iscuda) {
1681: PetscStrcmp(vtype,VECSEQCUDA,&iscuda);
1682: if (!iscuda) {
1683: PetscStrcmp(vtype,VECMPICUDA,&iscuda);
1684: }
1685: }
1686: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1687: }
1688: #endif
1689: MatSetType(A,MATH2OPUS);
1690: MatBindToCPU(A,boundtocpu);
1691: if (spacedim) {
1692: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL);
1693: }
1694: MatPropagateSymmetryOptions(B,A);
1695: /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1697: h2opus = (Mat_H2OPUS*)A->data;
1698: h2opus->sampler = new PetscMatrixSampler(B);
1699: if (eta > 0.) h2opus->eta = eta;
1700: if (leafsize > 0) h2opus->leafsize = leafsize;
1701: if (maxrank > 0) h2opus->max_rank = maxrank;
1702: if (bs > 0) h2opus->bs = bs;
1703: if (rtol > 0.) h2opus->rtol = rtol;
1704: *nA = A;
1705: A->preallocated = PETSC_TRUE;
1706: return(0);
1707: }
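/* A minimal sketch of construction by sampling an existing (sequential) operator B, assuming
   2D coordinates in coords[] associated with the rows of B; all numeric parameters are illustrative:

     Mat A;

     MatCreateH2OpusFromMat(B,2,coords,PETSC_FALSE,0.9,32,64,32,1.e-4,&A);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);   // HARA sampling of B runs in MatAssemblyEnd_H2OPUS()
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/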
1709: /*@C
1710:      MatH2OpusGetIndexMap - Access the reordering index set.
1712: Input Parameters:
1713: . A - the matrix
1715: Output Parameter:
1716: . indexmap - the index set for the reordering
1718: Level: intermediate
1720: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1721: @*/
1722: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1723: {
1724: PetscBool ish2opus;
1725: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1732: if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1733: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1734: if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1735: *indexmap = a->h2opus_indexmap;
1736: return(0);
1737: }
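/* A minimal usage sketch, assuming A is an assembled MATH2OPUS:

     IS map;

     MatH2OpusGetIndexMap(A,&map);              // borrowed reference, do not destroy
     ISView(map,PETSC_VIEWER_STDOUT_WORLD);
*/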
1739: /*@C
1740: MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1742: Input Parameters:
1743: + A - the matrix
1744: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1745: - in - the vector to be mapped
1747: Output Parameter:
1748: . out - the newly created mapped vector
1750: Level: intermediate
1752: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1753: @*/
1754: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1755: {
1756: PetscBool ish2opus;
1757: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1758: PetscScalar *xin,*xout;
1759: PetscBool nm;
1768: if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1769: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1770: if (!ish2opus) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1771: nm = a->nativemult;
1772: MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc);
1773: MatCreateVecs(A,out,NULL);
1774: MatH2OpusSetNativeMult(A,nm);
1775: if (!a->sf) { /* same ordering */
1776: VecCopy(in,*out);
1777: return(0);
1778: }
1779: VecGetArrayRead(in,(const PetscScalar**)&xin);
1780: VecGetArrayWrite(*out,&xout);
1781: if (nativetopetsc) {
1782: PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1783: PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1784: } else {
1785: PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1786: PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1787: }
1788: VecRestoreArrayRead(in,(const PetscScalar**)&xin);
1789: VecRestoreArrayWrite(*out,&xout);
1790: return(0);
1791: }
1792: #endif