Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petsc/private/deviceimpl.h>
16: #include <petscsf.h>
18: /* math2opusutils */
19: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21: PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22: PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
25: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
27: /* Use GPU only if H2OPUS is configured for GPU */
28: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29: #define PETSC_H2OPUS_USE_GPU
30: #endif
31: #if defined(PETSC_H2OPUS_USE_GPU)
32: #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33: #else
34: #define MatH2OpusUpdateIfNeeded(A,B) 0
35: #endif
37: // TODO H2OPUS:
38: // DistributedHMatrix
39: // unsymmetric ?
40: // transpose for distributed_hgemv?
41: // clearData()
42: // Unify interface for sequential and parallel?
43: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44: //
45: template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46: {
47: private:
48: int dimension;
49: size_t num_points;
50: std::vector<T> pts;
52: public:
53: PetscPointCloud(int dim, size_t num_pts, const T coords[])
54: {
55: dim = dim > 0 ? dim : 1;
56: this->dimension = dim;
57: this->num_points = num_pts;
59: pts.resize(num_pts*dim);
60: if (coords) {
61: for (size_t n = 0; n < num_points; n++)
62: for (int i = 0; i < dim; i++)
63: pts[n*dim + i] = coords[n*dim + i];
64: } else {
65: PetscReal h = 1./(num_points - 1);
66: for (size_t n = 0; n < num_points; n++)
67: for (int i = 0; i < dim; i++)
68: pts[n*dim + i] = i*h;
69: }
70: }
72: PetscPointCloud(const PetscPointCloud<T>& other)
73: {
74: size_t N = other.dimension * other.num_points;
75: this->dimension = other.dimension;
76: this->num_points = other.num_points;
77: this->pts.resize(N);
78: for (size_t i = 0; i < N; i++)
79: this->pts[i] = other.pts[i];
80: }
82: int getDimension() const
83: {
84: return dimension;
85: }
87: size_t getDataSetSize() const
88: {
89: return num_points;
90: }
92: T getDataPoint(size_t idx, int dim) const
93: {
94: assert(dim < dimension && idx < num_points);
95: return pts[idx*dimension + dim];
96: }
98: void Print(std::ostream& out = std::cout)
99: {
100: out << "Dimension: " << dimension << std::endl;
101: out << "NumPoints: " << num_points << std::endl;
102: for (size_t n = 0; n < num_points; n++) {
103: for (int d = 0; d < dimension; d++)
104: out << pts[n*dimension + d] << " ";
105: out << std::endl;
106: }
107: }
108: };
110: template<class T> class PetscFunctionGenerator
111: {
112: private:
113: MatH2OpusKernel k;
114: int dim;
115: void *ctx;
117: public:
118: PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
119: PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
120: T operator()(PetscReal *pt1, PetscReal *pt2)
121: {
122: return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
123: }
124: };
126: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
128: /* just to not clutter the code */
129: #if !defined(H2OPUS_USE_GPU)
130: typedef HMatrix HMatrix_GPU;
131: #if defined(H2OPUS_USE_MPI)
132: typedef DistributedHMatrix DistributedHMatrix_GPU;
133: #endif
134: #endif
136: typedef struct {
137: #if defined(H2OPUS_USE_MPI)
138: distributedH2OpusHandle_t handle;
139: #else
140: h2opusHandle_t handle;
141: #endif
142: /* Sequential and parallel matrices are two different classes at the moment */
143: HMatrix *hmatrix;
144: #if defined(H2OPUS_USE_MPI)
145: DistributedHMatrix *dist_hmatrix;
146: #else
147: HMatrix *dist_hmatrix; /* just to not clutter the code */
148: #endif
149: /* May use permutations */
150: PetscSF sf;
151: PetscLayout h2opus_rmap, h2opus_cmap;
152: IS h2opus_indexmap;
153: thrust::host_vector<PetscScalar> *xx,*yy;
154: PetscInt xxs,yys;
155: PetscBool multsetup;
157: /* GPU */
158: HMatrix_GPU *hmatrix_gpu;
159: #if defined(H2OPUS_USE_MPI)
160: DistributedHMatrix_GPU *dist_hmatrix_gpu;
161: #else
162: HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
163: #endif
164: #if defined(PETSC_H2OPUS_USE_GPU)
165: thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
166: PetscInt xxs_gpu,yys_gpu;
167: #endif
169: /* construction from matvecs */
170: PetscMatrixSampler* sampler;
171: PetscBool nativemult;
173: /* Admissibility */
174: PetscReal eta;
175: PetscInt leafsize;
177: /* for dof reordering */
178: PetscPointCloud<PetscReal> *ptcloud;
180: /* kernel for generating matrix entries */
181: PetscFunctionGenerator<PetscScalar> *kernel;
183: /* basis orthogonalized? */
184: PetscBool orthogonal;
186: /* customization */
187: PetscInt basisord;
188: PetscInt max_rank;
189: PetscInt bs;
190: PetscReal rtol;
191: PetscInt norm_max_samples;
192: PetscBool check_construction;
193: PetscBool hara_verbose;
195: /* keeps track of MatScale values */
196: PetscScalar s;
197: } Mat_H2OPUS;
199: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200: {
201: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
203: #if defined(H2OPUS_USE_MPI)
204: h2opusDestroyDistributedHandle(a->handle);
205: #else
206: h2opusDestroyHandle(a->handle);
207: #endif
208: delete a->dist_hmatrix;
209: delete a->hmatrix;
210: PetscSFDestroy(&a->sf);
211: PetscLayoutDestroy(&a->h2opus_rmap);
212: PetscLayoutDestroy(&a->h2opus_cmap);
213: ISDestroy(&a->h2opus_indexmap);
214: delete a->xx;
215: delete a->yy;
216: delete a->hmatrix_gpu;
217: delete a->dist_hmatrix_gpu;
218: #if defined(PETSC_H2OPUS_USE_GPU)
219: delete a->xx_gpu;
220: delete a->yy_gpu;
221: #endif
222: delete a->sampler;
223: delete a->ptcloud;
224: delete a->kernel;
225: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL);
226: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL);
227: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL);
228: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL);
229: PetscObjectChangeTypeName((PetscObject)A,NULL);
230: PetscFree(A->data);
231: return 0;
232: }
234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
237: PetscBool ish2opus;
241: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
242: if (ish2opus) {
243: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
244: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
245: PetscLayout t;
246: t = A->rmap;
247: A->rmap = a->h2opus_rmap;
248: a->h2opus_rmap = t;
249: t = A->cmap;
250: A->cmap = a->h2opus_cmap;
251: a->h2opus_cmap = t;
252: }
253: }
254: a->nativemult = nm;
255: }
256: return 0;
257: }
259: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
260: {
261: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
262: PetscBool ish2opus;
266: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
268: *nm = a->nativemult;
269: return 0;
270: }
272: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
273: {
274: PetscBool ish2opus;
275: PetscInt nmax = PETSC_DECIDE;
276: Mat_H2OPUS *a = NULL;
277: PetscBool mult = PETSC_FALSE;
279: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
280: if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
281: a = (Mat_H2OPUS*)A->data;
283: nmax = a->norm_max_samples;
284: mult = a->nativemult;
285: MatH2OpusSetNativeMult(A,PETSC_TRUE);
286: } else {
287: PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL);
288: }
289: MatApproximateNorm_Private(A,normtype,nmax,n);
290: if (a) MatH2OpusSetNativeMult(A,mult);
291: return 0;
292: }
294: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
295: {
296: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
297: PetscInt n;
298: PetscBool boundtocpu = PETSC_TRUE;
300: #if defined(PETSC_H2OPUS_USE_GPU)
301: boundtocpu = A->boundtocpu;
302: #endif
303: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
304: if (boundtocpu) {
305: if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
306: if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
307: }
308: #if defined(PETSC_H2OPUS_USE_GPU)
309: if (!boundtocpu) {
310: if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
311: if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
312: }
313: #endif
314: return 0;
315: }
317: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
318: {
319: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
320: #if defined(H2OPUS_USE_MPI)
321: h2opusHandle_t handle = h2opus->handle->handle;
322: #else
323: h2opusHandle_t handle = h2opus->handle;
324: #endif
325: PetscBool boundtocpu = PETSC_TRUE;
326: PetscScalar *xx,*yy,*uxx,*uyy;
327: PetscInt blda,clda;
328: PetscMPIInt size;
329: PetscSF bsf,csf;
330: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
332: HLibProfile::clear();
333: #if defined(PETSC_H2OPUS_USE_GPU)
334: boundtocpu = A->boundtocpu;
335: #endif
336: MatDenseGetLDA(B,&blda);
337: MatDenseGetLDA(C,&clda);
338: if (usesf) {
339: PetscInt n;
341: MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf);
342: MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf);
344: MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N);
345: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
346: blda = n;
347: clda = n;
348: }
349: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
350: if (boundtocpu) {
351: MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
352: MatDenseGetArrayWrite(C,&yy);
353: if (usesf) {
354: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
355: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
356: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
357: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
358: } else {
359: uxx = xx;
360: uyy = yy;
361: }
362: if (size > 1) {
365: #if defined(H2OPUS_USE_MPI)
366: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
367: #endif
368: } else {
370: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
371: }
372: MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
373: if (usesf) {
374: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
375: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
376: }
377: MatDenseRestoreArrayWrite(C,&yy);
378: #if defined(PETSC_H2OPUS_USE_GPU)
379: } else {
380: PetscBool ciscuda,biscuda;
382: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
383: PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
384: if (!biscuda) {
385: MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
386: }
387: PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
388: if (!ciscuda) {
389: C->assembled = PETSC_TRUE;
390: MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
391: }
392: MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
393: MatDenseCUDAGetArrayWrite(C,&yy);
394: if (usesf) {
395: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
396: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
397: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
398: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
399: } else {
400: uxx = xx;
401: uyy = yy;
402: }
403: PetscLogGpuTimeBegin();
404: if (size > 1) {
407: #if defined(H2OPUS_USE_MPI)
408: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
409: #endif
410: } else {
412: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
413: }
414: PetscLogGpuTimeEnd();
415: MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
416: if (usesf) {
417: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
418: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
419: }
420: MatDenseCUDARestoreArrayWrite(C,&yy);
421: if (!biscuda) {
422: MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
423: }
424: if (!ciscuda) {
425: MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
426: }
427: #endif
428: }
429: { /* log flops */
430: double gops,time,perf,dev;
431: HLibProfile::getHgemvPerf(gops,time,perf,dev);
432: #if defined(PETSC_H2OPUS_USE_GPU)
433: if (boundtocpu) {
434: PetscLogFlops(1e9*gops);
435: } else {
436: PetscLogGpuFlops(1e9*gops);
437: }
438: #else
439: PetscLogFlops(1e9*gops);
440: #endif
441: }
442: return 0;
443: }
445: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
446: {
447: Mat_Product *product = C->product;
449: MatCheckProduct(C,1);
450: switch (product->type) {
451: case MATPRODUCT_AB:
452: MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C);
453: break;
454: case MATPRODUCT_AtB:
455: MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C);
456: break;
457: default:
458: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
459: }
460: return 0;
461: }
463: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
464: {
465: Mat_Product *product = C->product;
466: PetscBool cisdense;
467: Mat A,B;
469: MatCheckProduct(C,1);
470: A = product->A;
471: B = product->B;
472: switch (product->type) {
473: case MATPRODUCT_AB:
474: MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
475: MatSetBlockSizesFromMats(C,product->A,product->B);
476: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
477: if (!cisdense) MatSetType(C,((PetscObject)product->B)->type_name);
478: MatSetUp(C);
479: break;
480: case MATPRODUCT_AtB:
481: MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
482: MatSetBlockSizesFromMats(C,product->A,product->B);
483: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
484: if (!cisdense) MatSetType(C,((PetscObject)product->B)->type_name);
485: MatSetUp(C);
486: break;
487: default:
488: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
489: }
490: C->ops->productsymbolic = NULL;
491: C->ops->productnumeric = MatProductNumeric_H2OPUS;
492: return 0;
493: }
495: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
496: {
497: MatCheckProduct(C,1);
498: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
499: C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
500: }
501: return 0;
502: }
504: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
505: {
506: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
507: #if defined(H2OPUS_USE_MPI)
508: h2opusHandle_t handle = h2opus->handle->handle;
509: #else
510: h2opusHandle_t handle = h2opus->handle;
511: #endif
512: PetscBool boundtocpu = PETSC_TRUE;
513: PetscInt n;
514: PetscScalar *xx,*yy,*uxx,*uyy;
515: PetscMPIInt size;
516: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
518: HLibProfile::clear();
519: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
520: #if defined(PETSC_H2OPUS_USE_GPU)
521: boundtocpu = A->boundtocpu;
522: #endif
523: if (usesf) {
524: PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL);
525: } else n = A->rmap->n;
526: if (boundtocpu) {
527: VecGetArrayRead(x,(const PetscScalar**)&xx);
528: if (sy == 0.0) {
529: VecGetArrayWrite(y,&yy);
530: } else {
531: VecGetArray(y,&yy);
532: }
533: if (usesf) {
534: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
535: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
537: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
538: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
539: if (sy != 0.0) {
540: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
541: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
542: }
543: } else {
544: uxx = xx;
545: uyy = yy;
546: }
547: if (size > 1) {
550: #if defined(H2OPUS_USE_MPI)
551: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
552: #endif
553: } else {
555: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
556: }
557: VecRestoreArrayRead(x,(const PetscScalar**)&xx);
558: if (usesf) {
559: PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
560: PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
561: }
562: if (sy == 0.0) {
563: VecRestoreArrayWrite(y,&yy);
564: } else {
565: VecRestoreArray(y,&yy);
566: }
567: #if defined(PETSC_H2OPUS_USE_GPU)
568: } else {
569: VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
570: if (sy == 0.0) {
571: VecCUDAGetArrayWrite(y,&yy);
572: } else {
573: VecCUDAGetArray(y,&yy);
574: }
575: if (usesf) {
576: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
577: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
579: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
580: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
581: if (sy != 0.0) {
582: PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
583: PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
584: }
585: } else {
586: uxx = xx;
587: uyy = yy;
588: }
589: PetscLogGpuTimeBegin();
590: if (size > 1) {
593: #if defined(H2OPUS_USE_MPI)
594: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
595: #endif
596: } else {
598: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
599: }
600: PetscLogGpuTimeEnd();
601: VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
602: if (usesf) {
603: PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
604: PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
605: }
606: if (sy == 0.0) {
607: VecCUDARestoreArrayWrite(y,&yy);
608: } else {
609: VecCUDARestoreArray(y,&yy);
610: }
611: #endif
612: }
613: { /* log flops */
614: double gops,time,perf,dev;
615: HLibProfile::getHgemvPerf(gops,time,perf,dev);
616: #if defined(PETSC_H2OPUS_USE_GPU)
617: if (boundtocpu) {
618: PetscLogFlops(1e9*gops);
619: } else {
620: PetscLogGpuFlops(1e9*gops);
621: }
622: #else
623: PetscLogFlops(1e9*gops);
624: #endif
625: }
626: return 0;
627: }
629: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
630: {
631: PetscBool xiscuda,yiscuda;
633: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
634: PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
635: MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
636: MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE);
637: return 0;
638: }
640: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
641: {
642: PetscBool xiscuda,yiscuda;
644: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
645: PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
646: MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda);
647: MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE);
648: return 0;
649: }
651: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
652: {
653: PetscBool xiscuda,ziscuda;
655: VecCopy(y,z);
656: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
657: PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
658: MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
659: MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE);
660: return 0;
661: }
663: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
664: {
665: PetscBool xiscuda,ziscuda;
667: VecCopy(y,z);
668: PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
669: PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"");
670: MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda);
671: MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE);
672: return 0;
673: }
675: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
676: {
677: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
679: a->s *= s;
680: return 0;
681: }
683: static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
684: {
685: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
687: PetscOptionsHead(PetscOptionsObject,"H2OPUS options");
688: PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL);
689: PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
690: PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
691: PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
692: PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
693: PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL);
694: PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
695: PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
696: PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL);
697: PetscOptionsTail();
698: return 0;
699: }
701: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
703: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
704: {
705: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
706: Vec c;
707: PetscInt spacedim;
708: const PetscScalar *coords;
710: if (a->ptcloud) return 0;
711: PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c);
712: if (!c && a->sampler) {
713: Mat S = a->sampler->GetSamplingMat();
715: PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c);
716: }
717: if (!c) {
718: MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL);
719: } else {
720: VecGetArrayRead(c,&coords);
721: VecGetBlockSize(c,&spacedim);
722: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL);
723: VecRestoreArrayRead(c,&coords);
724: }
725: return 0;
726: }
728: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
729: {
730: MPI_Comm comm;
731: PetscMPIInt size;
732: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
733: PetscInt n = 0,*idx = NULL;
734: int *iidx = NULL;
735: PetscCopyMode own;
736: PetscBool rid;
738: if (a->multsetup) return 0;
739: if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
740: PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
741: #if defined(PETSC_H2OPUS_USE_GPU)
742: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
743: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
744: a->xxs_gpu = 1;
745: a->yys_gpu = 1;
746: #endif
747: a->xx = new thrust::host_vector<PetscScalar>(n);
748: a->yy = new thrust::host_vector<PetscScalar>(n);
749: a->xxs = 1;
750: a->yys = 1;
751: } else {
752: IS is;
753: PetscObjectGetComm((PetscObject)A,&comm);
754: MPI_Comm_size(comm,&size);
755: if (!a->h2opus_indexmap) {
756: if (size > 1) {
758: #if defined(H2OPUS_USE_MPI)
759: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
760: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
761: #endif
762: } else {
763: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
764: n = a->hmatrix->u_basis_tree.index_map.size();
765: }
767: if (PetscDefined(USE_64BIT_INDICES)) {
768: PetscInt i;
770: own = PETSC_OWN_POINTER;
771: PetscMalloc1(n,&idx);
772: for (i=0;i<n;i++) idx[i] = iidx[i];
773: } else {
774: own = PETSC_COPY_VALUES;
775: idx = (PetscInt*)iidx;
776: }
777: ISCreateGeneral(comm,n,idx,own,&is);
778: ISSetPermutation(is);
779: ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view");
780: a->h2opus_indexmap = is;
781: }
782: ISGetLocalSize(a->h2opus_indexmap,&n);
783: ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
784: rid = (PetscBool)(n == A->rmap->n);
785: MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm);
786: if (rid) {
787: ISIdentity(a->h2opus_indexmap,&rid);
788: }
789: if (!rid) {
790: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
791: PetscLayoutCreate(comm,&a->h2opus_rmap);
792: PetscLayoutSetLocalSize(a->h2opus_rmap,n);
793: PetscLayoutSetUp(a->h2opus_rmap);
794: PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap);
795: }
796: PetscSFCreate(comm,&a->sf);
797: PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
798: PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view");
799: #if defined(PETSC_H2OPUS_USE_GPU)
800: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
801: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
802: a->xxs_gpu = 1;
803: a->yys_gpu = 1;
804: #endif
805: a->xx = new thrust::host_vector<PetscScalar>(n);
806: a->yy = new thrust::host_vector<PetscScalar>(n);
807: a->xxs = 1;
808: a->yys = 1;
809: }
810: ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx);
811: }
812: a->multsetup = PETSC_TRUE;
813: return 0;
814: }
816: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
817: {
818: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
819: #if defined(H2OPUS_USE_MPI)
820: h2opusHandle_t handle = a->handle->handle;
821: #else
822: h2opusHandle_t handle = a->handle;
823: #endif
824: PetscBool kernel = PETSC_FALSE;
825: PetscBool boundtocpu = PETSC_TRUE;
826: PetscBool samplingdone = PETSC_FALSE;
827: MPI_Comm comm;
828: PetscMPIInt size;
830: PetscObjectGetComm((PetscObject)A,&comm);
834: /* XXX */
835: a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
837: MPI_Comm_size(comm,&size);
838: /* TODO REUSABILITY of geometric construction */
839: delete a->hmatrix;
840: delete a->dist_hmatrix;
841: #if defined(PETSC_H2OPUS_USE_GPU)
842: delete a->hmatrix_gpu;
843: delete a->dist_hmatrix_gpu;
844: #endif
845: a->orthogonal = PETSC_FALSE;
847: /* TODO: other? */
848: H2OpusBoxCenterAdmissibility adm(a->eta);
850: PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0);
851: if (size > 1) {
852: #if defined(H2OPUS_USE_MPI)
853: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
854: #else
855: a->dist_hmatrix = NULL;
856: #endif
857: } else {
858: a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
859: }
860: MatH2OpusInferCoordinates_Private(A);
862: if (a->kernel) {
863: BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
864: if (size > 1) {
866: #if defined(H2OPUS_USE_MPI)
867: buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
868: #endif
869: } else {
870: buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
871: }
872: kernel = PETSC_TRUE;
873: } else {
875: buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
876: }
877: MatSetUpMultiply_H2OPUS(A);
879: #if defined(PETSC_H2OPUS_USE_GPU)
880: boundtocpu = A->boundtocpu;
881: if (!boundtocpu) {
882: if (size > 1) {
884: #if defined(H2OPUS_USE_MPI)
885: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
886: #endif
887: } else {
888: a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
889: }
890: }
891: #endif
892: if (size == 1) {
893: if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
894: PetscReal Anorm;
895: bool verbose;
897: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL);
898: verbose = a->hara_verbose;
899: MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm);
900: if (a->hara_verbose) PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs);
901: if (a->sf && !a->nativemult) {
902: a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
903: }
904: a->sampler->SetStream(handle->getMainStream());
905: if (boundtocpu) {
906: a->sampler->SetGPUSampling(false);
907: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
908: #if defined(PETSC_H2OPUS_USE_GPU)
909: } else {
910: a->sampler->SetGPUSampling(true);
911: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
912: #endif
913: }
914: samplingdone = PETSC_TRUE;
915: }
916: }
917: #if defined(PETSC_H2OPUS_USE_GPU)
918: if (!boundtocpu) {
919: delete a->hmatrix;
920: delete a->dist_hmatrix;
921: a->hmatrix = NULL;
922: a->dist_hmatrix = NULL;
923: }
924: A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
925: #endif
926: PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0);
928: if (!a->s) a->s = 1.0;
929: A->assembled = PETSC_TRUE;
931: if (samplingdone) {
932: PetscBool check = a->check_construction;
933: PetscBool checke = PETSC_FALSE;
935: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL);
936: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL);
937: if (check) {
938: Mat E,Ae;
939: PetscReal n1,ni,n2;
940: PetscReal n1A,niA,n2A;
941: void (*normfunc)(void);
943: Ae = a->sampler->GetSamplingMat();
944: MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
945: MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
946: MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
947: MatNorm(E,NORM_1,&n1);
948: MatNorm(E,NORM_INFINITY,&ni);
949: MatNorm(E,NORM_2,&n2);
950: if (checke) {
951: Mat eA,eE,eAe;
953: MatComputeOperator(A,MATAIJ,&eA);
954: MatComputeOperator(E,MATAIJ,&eE);
955: MatComputeOperator(Ae,MATAIJ,&eAe);
956: MatChop(eA,PETSC_SMALL);
957: MatChop(eE,PETSC_SMALL);
958: MatChop(eAe,PETSC_SMALL);
959: PetscObjectSetName((PetscObject)eA,"H2Mat");
960: MatView(eA,NULL);
961: PetscObjectSetName((PetscObject)eAe,"S");
962: MatView(eAe,NULL);
963: PetscObjectSetName((PetscObject)eE,"H2Mat - S");
964: MatView(eE,NULL);
965: MatDestroy(&eA);
966: MatDestroy(&eE);
967: MatDestroy(&eAe);
968: }
970: MatGetOperation(Ae,MATOP_NORM,&normfunc);
971: MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);
972: MatNorm(Ae,NORM_1,&n1A);
973: MatNorm(Ae,NORM_INFINITY,&niA);
974: MatNorm(Ae,NORM_2,&n2A);
975: n1A = PetscMax(n1A,PETSC_SMALL);
976: n2A = PetscMax(n2A,PETSC_SMALL);
977: niA = PetscMax(niA,PETSC_SMALL);
978: MatSetOperation(Ae,MATOP_NORM,normfunc);
979: PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
980: MatDestroy(&E);
981: }
982: a->sampler->SetSamplingMat(NULL);
983: }
984: return 0;
985: }
987: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
988: {
989: PetscMPIInt size;
990: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
992: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
994: else {
995: a->hmatrix->clearData();
996: #if defined(PETSC_H2OPUS_USE_GPU)
997: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
998: #endif
999: }
1000: return 0;
1001: }
1003: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1004: {
1005: Mat A;
1006: Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data;
1007: #if defined(PETSC_H2OPUS_USE_GPU)
1008: PetscBool iscpu = PETSC_FALSE;
1009: #else
1010: PetscBool iscpu = PETSC_TRUE;
1011: #endif
1012: MPI_Comm comm;
1014: PetscObjectGetComm((PetscObject)B,&comm);
1015: MatCreate(comm,&A);
1016: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1017: MatSetType(A,MATH2OPUS);
1018: MatPropagateSymmetryOptions(B,A);
1019: a = (Mat_H2OPUS*)A->data;
1021: a->eta = b->eta;
1022: a->leafsize = b->leafsize;
1023: a->basisord = b->basisord;
1024: a->max_rank = b->max_rank;
1025: a->bs = b->bs;
1026: a->rtol = b->rtol;
1027: a->norm_max_samples = b->norm_max_samples;
1028: if (op == MAT_COPY_VALUES) a->s = b->s;
1030: a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1031: if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1033: #if defined(H2OPUS_USE_MPI)
1034: if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1035: #if defined(PETSC_H2OPUS_USE_GPU)
1036: if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1037: #endif
1038: #endif
1039: if (b->hmatrix) {
1040: a->hmatrix = new HMatrix(*b->hmatrix);
1041: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1042: }
1043: #if defined(PETSC_H2OPUS_USE_GPU)
1044: if (b->hmatrix_gpu) {
1045: a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1046: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1047: }
1048: #endif
1049: if (b->sf) {
1050: PetscObjectReference((PetscObject)b->sf);
1051: a->sf = b->sf;
1052: }
1053: if (b->h2opus_indexmap) {
1054: PetscObjectReference((PetscObject)b->h2opus_indexmap);
1055: a->h2opus_indexmap = b->h2opus_indexmap;
1056: }
1058: MatSetUp(A);
1059: MatSetUpMultiply_H2OPUS(A);
1060: if (op == MAT_COPY_VALUES) {
1061: A->assembled = PETSC_TRUE;
1062: a->orthogonal = b->orthogonal;
1063: #if defined(PETSC_H2OPUS_USE_GPU)
1064: A->offloadmask = B->offloadmask;
1065: #endif
1066: }
1067: #if defined(PETSC_H2OPUS_USE_GPU)
1068: iscpu = B->boundtocpu;
1069: #endif
1070: MatBindToCPU(A,iscpu);
1072: *nA = A;
1073: return 0;
1074: }
1076: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1077: {
1078: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
1079: PetscBool isascii;
1080: PetscMPIInt size;
1081: PetscViewerFormat format;
1083: PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1084: PetscViewerGetFormat(view,&format);
1085: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1086: if (isascii) {
1087: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1088: if (size == 1) {
1089: FILE *fp;
1090: PetscViewerASCIIGetPointer(view,&fp);
1091: dumpHMatrix(*h2opus->hmatrix,6,fp);
1092: }
1093: } else {
1094: PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat");
1095: PetscViewerASCIIPrintf(view," PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0);
1096: PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta);
1097: if (!h2opus->kernel) {
1098: PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol);
1099: } else {
1100: PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord);
1101: }
1102: PetscViewerASCIIPrintf(view," Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples);
1103: if (size == 1) {
1104: double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1105: double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1106: #if defined(PETSC_HAVE_CUDA)
1107: double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1108: double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1109: #endif
1110: PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1111: #if defined(PETSC_HAVE_CUDA)
1112: PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1113: #endif
1114: } else {
1115: #if defined(PETSC_HAVE_CUDA)
1116: double matrix_mem[4] = {0.,0.,0.,0.};
1117: PetscMPIInt rsize = 4;
1118: #else
1119: double matrix_mem[2] = {0.,0.};
1120: PetscMPIInt rsize = 2;
1121: #endif
1122: #if defined(H2OPUS_USE_MPI)
1123: matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1124: matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1125: #if defined(PETSC_HAVE_CUDA)
1126: matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1127: matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1128: #endif
1129: #endif
1130: MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A));
1131: PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]);
1132: #if defined(PETSC_HAVE_CUDA)
1133: PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]);
1134: #endif
1135: }
1136: }
1137: }
1138: #if 0
1139: if (size == 1) {
1140: char filename[256];
1141: const char *name;
1143: PetscObjectGetName((PetscObject)A,&name);
1144: PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name);
1145: outputEps(*h2opus->hmatrix,filename);
1146: }
1147: #endif
1148: return 0;
1149: }
1151: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1152: {
1153: Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
1154: PetscReal *gcoords;
1155: PetscInt N;
1156: MPI_Comm comm;
1157: PetscMPIInt size;
1158: PetscBool cong;
1160: PetscLayoutSetUp(A->rmap);
1161: PetscLayoutSetUp(A->cmap);
1162: PetscObjectGetComm((PetscObject)A,&comm);
1163: MatHasCongruentLayouts(A,&cong);
1165: N = A->rmap->N;
1166: MPI_Comm_size(comm,&size);
1167: if (spacedim > 0 && size > 1 && cdist) {
1168: PetscSF sf;
1169: MPI_Datatype dtype;
1171: MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1172: MPI_Type_commit(&dtype);
1174: PetscSFCreate(comm,&sf);
1175: PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1176: PetscMalloc1(spacedim*N,&gcoords);
1177: PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1178: PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1179: PetscSFDestroy(&sf);
1180: MPI_Type_free(&dtype);
1181: } else gcoords = (PetscReal*)coords;
1183: delete h2opus->ptcloud;
1184: delete h2opus->kernel;
1185: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1186: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1187: if (gcoords != coords) PetscFree(gcoords);
1188: A->preallocated = PETSC_TRUE;
1189: return 0;
1190: }
1192: #if defined(PETSC_H2OPUS_USE_GPU)
1193: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1194: {
1195: PetscMPIInt size;
1196: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1198: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1199: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1200: if (size > 1) {
1202: #if defined(H2OPUS_USE_MPI)
1203: if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1204: else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1205: #endif
1206: } else {
1208: if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1209: else *a->hmatrix = *a->hmatrix_gpu;
1210: }
1211: delete a->hmatrix_gpu;
1212: delete a->dist_hmatrix_gpu;
1213: a->hmatrix_gpu = NULL;
1214: a->dist_hmatrix_gpu = NULL;
1215: A->offloadmask = PETSC_OFFLOAD_CPU;
1216: } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1217: if (size > 1) {
1219: #if defined(H2OPUS_USE_MPI)
1220: if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1221: else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1222: #endif
1223: } else {
1225: if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1226: else *a->hmatrix_gpu = *a->hmatrix;
1227: }
1228: delete a->hmatrix;
1229: delete a->dist_hmatrix;
1230: a->hmatrix = NULL;
1231: a->dist_hmatrix = NULL;
1232: A->offloadmask = PETSC_OFFLOAD_GPU;
1233: }
1234: PetscFree(A->defaultvectype);
1235: if (!flg) {
1236: PetscStrallocpy(VECCUDA,&A->defaultvectype);
1237: } else {
1238: PetscStrallocpy(VECSTANDARD,&A->defaultvectype);
1239: }
1240: A->boundtocpu = flg;
1241: return 0;
1242: }
1243: #endif
1245: /*MC
1246: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1248: Options Database Keys:
1249: . -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()
1251: Notes:
1252: H2Opus implements hierarchical matrices in the H^2 flavour.
1253: It supports CPU or NVIDIA GPUs.
1254: For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1255: In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1256: For details and additional references, see
1257: "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1258: available at https://arxiv.org/abs/2109.05451.
1260: Level: beginner
1262: .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1263: M*/
1264: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1265: {
1266: Mat_H2OPUS *a;
1267: PetscMPIInt size;
1269: #if defined(PETSC_H2OPUS_USE_GPU)
1270: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1271: #endif
1272: PetscNewLog(A,&a);
1273: A->data = (void*)a;
1275: a->eta = 0.9;
1276: a->leafsize = 32;
1277: a->basisord = 4;
1278: a->max_rank = 64;
1279: a->bs = 32;
1280: a->rtol = 1.e-4;
1281: a->s = 1.0;
1282: a->norm_max_samples = 10;
1283: #if defined(H2OPUS_USE_MPI)
1284: h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1285: #else
1286: h2opusCreateHandle(&a->handle);
1287: #endif
1288: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1289: PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS);
1290: PetscMemzero(A->ops,sizeof(struct _MatOps));
1292: A->ops->destroy = MatDestroy_H2OPUS;
1293: A->ops->view = MatView_H2OPUS;
1294: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1295: A->ops->mult = MatMult_H2OPUS;
1296: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1297: A->ops->multadd = MatMultAdd_H2OPUS;
1298: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1299: A->ops->scale = MatScale_H2OPUS;
1300: A->ops->duplicate = MatDuplicate_H2OPUS;
1301: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1302: A->ops->norm = MatNorm_H2OPUS;
1303: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1304: #if defined(PETSC_H2OPUS_USE_GPU)
1305: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1306: #endif
1308: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS);
1309: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS);
1310: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS);
1311: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS);
1312: #if defined(PETSC_H2OPUS_USE_GPU)
1313: PetscFree(A->defaultvectype);
1314: PetscStrallocpy(VECCUDA,&A->defaultvectype);
1315: #endif
1316: return 0;
1317: }
1319: /*@C
1320: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1322: Input Parameter:
1323: . A - the matrix
1325: Level: intermediate
1327: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1328: @*/
1329: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1330: {
1331: PetscBool ish2opus;
1332: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1333: PetscMPIInt size;
1334: PetscBool boundtocpu = PETSC_TRUE;
1338: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1339: if (!ish2opus) return 0;
1340: if (a->orthogonal) return 0;
1341: HLibProfile::clear();
1342: PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0);
1343: #if defined(PETSC_H2OPUS_USE_GPU)
1344: boundtocpu = A->boundtocpu;
1345: #endif
1346: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1347: if (size > 1) {
1348: if (boundtocpu) {
1350: #if defined(H2OPUS_USE_MPI)
1351: distributed_horthog(*a->dist_hmatrix, a->handle);
1352: #endif
1353: #if defined(PETSC_H2OPUS_USE_GPU)
1354: A->offloadmask = PETSC_OFFLOAD_CPU;
1355: } else {
1357: PetscLogGpuTimeBegin();
1358: #if defined(H2OPUS_USE_MPI)
1359: distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1360: #endif
1361: PetscLogGpuTimeEnd();
1362: #endif
1363: }
1364: } else {
1365: #if defined(H2OPUS_USE_MPI)
1366: h2opusHandle_t handle = a->handle->handle;
1367: #else
1368: h2opusHandle_t handle = a->handle;
1369: #endif
1370: if (boundtocpu) {
1372: horthog(*a->hmatrix, handle);
1373: #if defined(PETSC_H2OPUS_USE_GPU)
1374: A->offloadmask = PETSC_OFFLOAD_CPU;
1375: } else {
1377: PetscLogGpuTimeBegin();
1378: horthog(*a->hmatrix_gpu, handle);
1379: PetscLogGpuTimeEnd();
1380: #endif
1381: }
1382: }
1383: a->orthogonal = PETSC_TRUE;
1384: { /* log flops */
1385: double gops,time,perf,dev;
1386: HLibProfile::getHorthogPerf(gops,time,perf,dev);
1387: #if defined(PETSC_H2OPUS_USE_GPU)
1388: if (boundtocpu) {
1389: PetscLogFlops(1e9*gops);
1390: } else {
1391: PetscLogGpuFlops(1e9*gops);
1392: }
1393: #else
1394: PetscLogFlops(1e9*gops);
1395: #endif
1396: }
1397: PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0);
1398: return 0;
1399: }
1401: /*@C
1402: MatH2OpusCompress - Compress a hierarchical matrix.
1404: Input Parameters:
1405: + A - the matrix
1406: - tol - the absolute truncation threshold
1408: Level: intermediate
1410: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1411: @*/
1412: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1413: {
1414: PetscBool ish2opus;
1415: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1416: PetscMPIInt size;
1417: PetscBool boundtocpu = PETSC_TRUE;
1422: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1423: if (!ish2opus || tol <= 0.0) return 0;
1424: MatH2OpusOrthogonalize(A);
1425: HLibProfile::clear();
1426: PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0);
1427: #if defined(PETSC_H2OPUS_USE_GPU)
1428: boundtocpu = A->boundtocpu;
1429: #endif
1430: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1431: if (size > 1) {
1432: if (boundtocpu) {
1434: #if defined(H2OPUS_USE_MPI)
1435: distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1436: #endif
1437: #if defined(PETSC_H2OPUS_USE_GPU)
1438: A->offloadmask = PETSC_OFFLOAD_CPU;
1439: } else {
1441: PetscLogGpuTimeBegin();
1442: #if defined(H2OPUS_USE_MPI)
1443: distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1444: #endif
1445: PetscLogGpuTimeEnd();
1446: #endif
1447: }
1448: } else {
1449: #if defined(H2OPUS_USE_MPI)
1450: h2opusHandle_t handle = a->handle->handle;
1451: #else
1452: h2opusHandle_t handle = a->handle;
1453: #endif
1454: if (boundtocpu) {
1456: hcompress(*a->hmatrix, tol, handle);
1457: #if defined(PETSC_H2OPUS_USE_GPU)
1458: A->offloadmask = PETSC_OFFLOAD_CPU;
1459: } else {
1461: PetscLogGpuTimeBegin();
1462: hcompress(*a->hmatrix_gpu, tol, handle);
1463: PetscLogGpuTimeEnd();
1464: #endif
1465: }
1466: }
1467: { /* log flops */
1468: double gops,time,perf,dev;
1469: HLibProfile::getHcompressPerf(gops,time,perf,dev);
1470: #if defined(PETSC_H2OPUS_USE_GPU)
1471: if (boundtocpu) {
1472: PetscLogFlops(1e9*gops);
1473: } else {
1474: PetscLogGpuFlops(1e9*gops);
1475: }
1476: #else
1477: PetscLogFlops(1e9*gops);
1478: #endif
1479: }
1480: PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0);
1481: return 0;
1482: }
1484: /*@C
1485: MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1487: Input Parameters:
1488: + A - the hierarchical matrix
1489: . B - the matrix to be sampled
1490: . bs - maximum number of samples to be taken concurrently
1491: - tol - relative tolerance for construction
1493: Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1495: Level: intermediate
1497: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1498: @*/
1499: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1500: {
1501: PetscBool ish2opus;
1508: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1509: if (ish2opus) {
1510: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1512: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1513: a->sampler->SetSamplingMat(B);
1514: if (bs > 0) a->bs = bs;
1515: if (tol > 0.) a->rtol = tol;
1516: delete a->kernel;
1517: }
1518: return 0;
1519: }
1521: /*@C
1522: MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1524: Input Parameters:
1525: + comm - MPI communicator
1526: . m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1527: . n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1528: . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1529: . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1530: . spacedim - dimension of the space coordinates
1531: . coords - coordinates of the points
1532: . cdist - whether or not coordinates are distributed
1533: . kernel - computational kernel (or NULL)
1534: . kernelctx - kernel context
1535: . eta - admissibility condition tolerance
1536: . leafsize - leaf size in cluster tree
1537: - basisord - approximation order for Chebychev interpolation of low-rank blocks
1539: Output Parameter:
1540: . nA - matrix
1542: Options Database Keys:
1543: + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1544: . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1545: . -mat_h2opus_order <PetscInt> - Chebychev approximation order
1546: - -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1548: Level: intermediate
1550: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1551: @*/
1552: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1553: {
1554: Mat A;
1555: Mat_H2OPUS *h2opus;
1556: #if defined(PETSC_H2OPUS_USE_GPU)
1557: PetscBool iscpu = PETSC_FALSE;
1558: #else
1559: PetscBool iscpu = PETSC_TRUE;
1560: #endif
1563: MatCreate(comm,&A);
1564: MatSetSizes(A,m,n,M,N);
1566: MatSetType(A,MATH2OPUS);
1567: MatBindToCPU(A,iscpu);
1568: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx);
1570: h2opus = (Mat_H2OPUS*)A->data;
1571: if (eta > 0.) h2opus->eta = eta;
1572: if (leafsize > 0) h2opus->leafsize = leafsize;
1573: if (basisord > 0) h2opus->basisord = basisord;
1575: *nA = A;
1576: return 0;
1577: }
1579: /*@C
1580: MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1582: Input Parameters:
1583: + B - the matrix to be sampled
1584: . spacedim - dimension of the space coordinates
1585: . coords - coordinates of the points
1586: . cdist - whether or not coordinates are distributed
1587: . eta - admissibility condition tolerance
1588: . leafsize - leaf size in cluster tree
1589: . maxrank - maximum rank allowed
1590: . bs - maximum number of samples to be taken concurrently
1591: - rtol - relative tolerance for construction
1593: Output Parameter:
1594: . nA - matrix
1596: Options Database Keys:
1597: + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1598: . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1599: . -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1600: . -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1601: . -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1602: . -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1603: . -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1604: - -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms
1606: Notes: not available in parallel
1608: Level: intermediate
1610: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1611: @*/
1612: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1613: {
1614: Mat A;
1615: Mat_H2OPUS *h2opus;
1616: MPI_Comm comm;
1617: PetscBool boundtocpu = PETSC_TRUE;
1628: PetscObjectGetComm((PetscObject)B,&comm);
1631: MatCreate(comm,&A);
1632: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1633: #if defined(PETSC_H2OPUS_USE_GPU)
1634: {
1635: PetscBool iscuda;
1636: VecType vtype;
1638: MatGetVecType(B,&vtype);
1639: PetscStrcmp(vtype,VECCUDA,&iscuda);
1640: if (!iscuda) {
1641: PetscStrcmp(vtype,VECSEQCUDA,&iscuda);
1642: if (!iscuda) {
1643: PetscStrcmp(vtype,VECMPICUDA,&iscuda);
1644: }
1645: }
1646: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1647: }
1648: #endif
1649: MatSetType(A,MATH2OPUS);
1650: MatBindToCPU(A,boundtocpu);
1651: if (spacedim) {
1652: MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL);
1653: }
1654: MatPropagateSymmetryOptions(B,A);
1657: h2opus = (Mat_H2OPUS*)A->data;
1658: h2opus->sampler = new PetscMatrixSampler(B);
1659: if (eta > 0.) h2opus->eta = eta;
1660: if (leafsize > 0) h2opus->leafsize = leafsize;
1661: if (maxrank > 0) h2opus->max_rank = maxrank;
1662: if (bs > 0) h2opus->bs = bs;
1663: if (rtol > 0.) h2opus->rtol = rtol;
1664: *nA = A;
1665: A->preallocated = PETSC_TRUE;
1666: return 0;
1667: }
1669: /*@C
1670: MatH2OpusGetIndexMap - Access reordering index set.
1672: Input Parameters:
1673: . A - the matrix
1675: Output Parameter:
1676: . indexmap - the index set for the reordering
1678: Level: intermediate
1680: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1681: @*/
1682: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1683: {
1684: PetscBool ish2opus;
1685: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1691: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1693: *indexmap = a->h2opus_indexmap;
1694: return 0;
1695: }
1697: /*@C
1698: MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1700: Input Parameters:
1701: + A - the matrix
1702: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1703: - in - the vector to be mapped
1705: Output Parameter:
1706: . out - the newly created mapped vector
1708: Level: intermediate
1710: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1711: @*/
1712: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1713: {
1714: PetscBool ish2opus;
1715: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1716: PetscScalar *xin,*xout;
1717: PetscBool nm;
1725: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);
1727: nm = a->nativemult;
1728: MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc);
1729: MatCreateVecs(A,out,NULL);
1730: MatH2OpusSetNativeMult(A,nm);
1731: if (!a->sf) { /* same ordering */
1732: VecCopy(in,*out);
1733: return 0;
1734: }
1735: VecGetArrayRead(in,(const PetscScalar**)&xin);
1736: VecGetArrayWrite(*out,&xout);
1737: if (nativetopetsc) {
1738: PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1739: PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1740: } else {
1741: PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1742: PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE);
1743: }
1744: VecRestoreArrayRead(in,(const PetscScalar**)&xin);
1745: VecRestoreArrayWrite(*out,&xout);
1746: return 0;
1747: }
1749: /*@C
1750: MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1752: Input Parameters:
1753: + A - the hierarchical matrix
1754: . s - the scaling factor
1755: . U - the dense low-rank update matrix
1756: - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1758: Notes: The U and V matrices must be in dense format
1760: Level: intermediate
1762: .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE
1763: @*/
1764: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1765: {
1766: PetscBool flg;
1773: if (V) {
1776: }
1779: if (!V) V = U;
1781: if (!U->cmap->N) return 0;
1782: PetscLayoutCompare(U->rmap,A->rmap,&flg);
1784: PetscLayoutCompare(V->rmap,A->cmap,&flg);
1786: PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg);
1787: if (flg) {
1788: Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1789: const PetscScalar *u,*v,*uu,*vv;
1790: PetscInt ldu,ldv;
1791: PetscMPIInt size;
1792: #if defined(H2OPUS_USE_MPI)
1793: h2opusHandle_t handle = a->handle->handle;
1794: #else
1795: h2opusHandle_t handle = a->handle;
1796: #endif
1797: PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1798: PetscSF usf,vsf;
1800: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1802: PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0);
1803: PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"");
1805: PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"");
1807: MatDenseGetLDA(U,&ldu);
1808: MatDenseGetLDA(V,&ldv);
1809: MatBoundToCPU(A,&flg);
1810: if (usesf) {
1811: PetscInt n;
1813: MatDenseGetH2OpusVectorSF(U,a->sf,&usf);
1814: MatDenseGetH2OpusVectorSF(V,a->sf,&vsf);
1815: MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N);
1816: PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL);
1817: ldu = n;
1818: ldv = n;
1819: }
1820: if (flg) {
1822: MatDenseGetArrayRead(U,&u);
1823: MatDenseGetArrayRead(V,&v);
1824: if (usesf) {
1825: vv = MatH2OpusGetThrustPointer(*a->yy);
1826: PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1827: PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1828: if (U != V) {
1829: uu = MatH2OpusGetThrustPointer(*a->xx);
1830: PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1831: PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1832: } else uu = vv;
1833: } else { uu = u; vv = v; }
1834: hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1835: MatDenseRestoreArrayRead(U,&u);
1836: MatDenseRestoreArrayRead(V,&v);
1837: } else {
1838: #if defined(PETSC_H2OPUS_USE_GPU)
1839: PetscBool flgU, flgV;
1842: PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"");
1843: if (flgU) MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U);
1844: PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"");
1845: if (flgV) MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V);
1846: MatDenseCUDAGetArrayRead(U,&u);
1847: MatDenseCUDAGetArrayRead(V,&v);
1848: if (usesf) {
1849: vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1850: PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1851: PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE);
1852: if (U != V) {
1853: uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1854: PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1855: PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE);
1856: } else uu = vv;
1857: } else { uu = u; vv = v; }
1858: #else
1859: SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1860: #endif
1861: hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1862: #if defined(PETSC_H2OPUS_USE_GPU)
1863: MatDenseCUDARestoreArrayRead(U,&u);
1864: MatDenseCUDARestoreArrayRead(V,&v);
1865: if (flgU) MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U);
1866: if (flgV) MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V);
1867: #endif
1868: }
1869: PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0);
1870: a->orthogonal = PETSC_FALSE;
1871: }
1872: return 0;
1873: }
1874: #endif