Actual source code: mathara.cu
1: #include <hara.h>
2: #include <distributed/distributed_hara_handle.h>
3: #include <distributed/distributed_hmatrix.h>
4: #include <distributed/distributed_geometric_construction.h>
5: #include <distributed/distributed_hgemv.h>
6: #include <petsc/private/matimpl.h>
7: #include <petscsf.h>
9: #define MatHaraGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
11: // TODO HARA:
12: // MatDuplicate buggy with commsize > 1
13: // kernel needs (global?) id of points (issues with Chebyshev points and coupling matrix computation)
14: // unsymmetrix DistributedHMatrix (transpose for distributed_hgemv?)
15: // Unify interface for sequential and parallel?
16: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
17: // Fix includes:
18: // - everything under hara/ dir (except for hara.h)
19: // - fix kblas includes
20: // - namespaces?
21: // Fix naming convention DistributedHMatrix_GPU vs GPU_HMatrix
22: // Diagnostics? FLOPS, MEMORY USAGE IN PARALLEL
23: // Why do we need to template the admissibility condition in the hmatrix construction?
24: //
25: template<typename T, int D>
26: struct PetscPointCloud
27: {
28: static const int Dim = D;
29: typedef T ElementType;
31: struct Point
32: {
33: T x[D];
34: Point() {
35: for (int i = 0; i < D; i++) x[i] = 0;
36: }
37: Point(const T xx[]) {
38: for (int i = 0; i < D; i++) x[i] = xx[i];
39: }
40: };
42: std::vector<Point> pts;
44: // Must return the number of data points
45: inline size_t kdtree_get_point_count() const { return pts.size(); }
47: // Returns the dim'th component of the idx'th point in the class:
48: inline T kdtree_get_pt(const size_t idx, int dim) const
49: {
50: return pts[idx].x[dim];
51: }
53: inline T get_pt(const size_t idx, int dim) const
54: {
55: return kdtree_get_pt(idx, dim);
56: }
58: inline bool kdtree_ignore_point(const size_t idx) const { return false; }
60: // Optional bounding-box computation: return false to default to a standard bbox computation loop.
61: // Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again.
62: // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds)
63: template <class BBOX>
64: bool kdtree_get_bbox(BBOX& /* bb */) const { return false; }
66: PetscPointCloud(PetscInt,const T[]);
67: PetscPointCloud(PetscPointCloud&);
68: };
70: template<typename T, int D>
71: PetscPointCloud<T,D>::PetscPointCloud(PetscInt n, const T coords[])
72: {
73: this->pts.resize(n);
74: for (PetscInt i = 0; i < n; i++) {
75: Point p(coords);
76: this->pts[i] = p;
77: coords += D;
78: }
79: }
81: template<typename T, int D>
82: PetscPointCloud<T,D>::PetscPointCloud(PetscPointCloud<T,D>& other)
83: {
84: this->pts = other.pts;
85: }
87: template <typename T>
88: using PetscPointCloud3D = PetscPointCloud<T,3>;
89: template <typename T>
90: using PetscPointCloud2D = PetscPointCloud<T,2>;
91: template <typename T>
92: using PetscPointCloud1D = PetscPointCloud<T,1>;
94: template<typename T, int Dim>
95: struct PetscFunctionGenerator
96: {
97: private:
98: MatHaraKernel k;
99: void *ctx;
101: public:
102: PetscFunctionGenerator(MatHaraKernel k, void* ctx) { this->k = k; this->ctx = ctx; }
103: PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->ctx = other.ctx; }
104: T operator()(PetscReal pt1[Dim], PetscReal pt2[Dim])
105: {
106: return (T)(this->k ? (*this->k)(Dim,pt1,pt2,this->ctx) : 0);
107: }
108: };
110: template <typename T>
111: using PetscFunctionGenerator3D = PetscFunctionGenerator<T,3>;
112: template <typename T>
113: using PetscFunctionGenerator2D = PetscFunctionGenerator<T,2>;
114: template <typename T>
115: using PetscFunctionGenerator1D = PetscFunctionGenerator<T,1>;
117: #include <../src/mat/impls/hara/matharasampler.hpp>
119: typedef struct {
120: distributedHaraHandle_t handle;
122: /* two different classes at the moment */
123: HMatrix *hmatrix;
124: DistributedHMatrix *dist_hmatrix;
126: /* May use permutations */
127: PetscSF sf;
128: PetscLayout hara_rmap;
129: thrust::host_vector<PetscScalar> *xx,*yy;
130: PetscInt xxs,yys;
131: PetscBool multsetup;
133: /* GPU */
134: #if defined(HARA_USE_GPU)
135: GPU_HMatrix *hmatrix_gpu;
136: DistributedHMatrix_GPU *dist_hmatrix_gpu;
137: thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
138: PetscInt xxs_gpu,yys_gpu;
139: #endif
141: /* construction from matvecs */
142: PetscMatrixSampler* sampler;
144: /* Admissibility */
145: PetscReal eta;
146: PetscInt leafsize;
148: /* for dof reordering */
149: PetscInt spacedim;
150: PetscPointCloud1D<PetscReal> *ptcloud1;
151: PetscPointCloud2D<PetscReal> *ptcloud2;
152: PetscPointCloud3D<PetscReal> *ptcloud3;
154: /* kernel for generating matrix entries */
155: PetscFunctionGenerator1D<PetscScalar> *kernel1;
156: PetscFunctionGenerator2D<PetscScalar> *kernel2;
157: PetscFunctionGenerator3D<PetscScalar> *kernel3;
159: /* customization */
160: PetscInt basisord;
161: PetscInt max_rank;
162: PetscInt bs;
163: PetscReal rtol;
164: PetscInt norm_max_samples;
165: PetscBool check_construction;
167: /* keeps track of MatScale values */
168: PetscScalar s;
169: } Mat_HARA;
171: static PetscErrorCode MatDestroy_HARA(Mat A)
172: {
173: Mat_HARA *a = (Mat_HARA*)A->data;
177: haraDestroyDistributedHandle(a->handle);
178: delete a->hmatrix;
179: delete a->dist_hmatrix;
180: PetscSFDestroy(&a->sf);
181: PetscLayoutDestroy(&a->hara_rmap);
182: delete a->xx;
183: delete a->yy;
184: #if defined(HARA_USE_GPU)
185: delete a->hmatrix_gpu;
186: delete a->dist_hmatrix_gpu;
187: delete a->xx_gpu;
188: delete a->yy_gpu;
189: #endif
190: delete a->sampler;
191: delete a->ptcloud1;
192: delete a->ptcloud2;
193: delete a->ptcloud3;
194: delete a->kernel1;
195: delete a->kernel2;
196: delete a->kernel3;
197: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",NULL);
198: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",NULL);
199: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",NULL);
200: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",NULL);
201: PetscObjectChangeTypeName((PetscObject)A,NULL);
202: PetscFree(A->data);
203: return(0);
204: }
206: static PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
207: {
208: PetscSF rankssf;
209: const PetscSFNode *iremote;
210: PetscSFNode *viremote,*rremotes;
211: const PetscInt *ilocal;
212: PetscInt *vilocal = NULL,*ldrs;
213: const PetscMPIInt *ranks;
214: PetscMPIInt *sranks;
215: PetscInt nranks,nr,nl,vnr,vnl,i,v,j,maxl;
216: MPI_Comm comm;
217: PetscErrorCode ierr;
220: if (nv == 1) {
221: PetscObjectReference((PetscObject)sf);
222: *vsf = sf;
223: return(0);
224: }
225: PetscObjectGetComm((PetscObject)sf,&comm);
226: PetscSFGetGraph(sf,&nr,&nl,&ilocal,&iremote);
227: PetscSFGetLeafRange(sf,NULL,&maxl);
228: maxl += 1;
229: if (ldl == PETSC_DECIDE) ldl = maxl;
230: if (ldr == PETSC_DECIDE) ldr = nr;
231: if (ldr < nr) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldr,nr);
232: if (ldl < maxl) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldl,maxl);
233: vnr = nr*nv;
234: vnl = nl*nv;
235: PetscMalloc1(vnl,&viremote);
236: if (ilocal) {
237: PetscMalloc1(vnl,&vilocal);
238: }
240: /* TODO: Should this special SF be available, e.g.
241: PetscSFGetRanksSF or similar? */
242: PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
243: PetscMalloc1(nranks,&sranks);
244: PetscArraycpy(sranks,ranks,nranks);
245: PetscSortMPIInt(nranks,sranks);
246: PetscMalloc1(nranks,&rremotes);
247: for (i=0;i<nranks;i++) {
248: rremotes[i].rank = sranks[i];
249: rremotes[i].index = 0;
250: }
251: PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&rankssf);
252: PetscSFSetGraph(rankssf,1,nranks,NULL,PETSC_OWN_POINTER,rremotes,PETSC_OWN_POINTER);
253: PetscMalloc1(nranks,&ldrs);
254: PetscSFBcastBegin(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
255: PetscSFBcastEnd(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
256: PetscSFDestroy(&rankssf);
258: j = -1;
259: for (i=0;i<nl;i++) {
260: const PetscInt r = iremote[i].rank;
261: const PetscInt ii = iremote[i].index;
263: if (j < 0 || sranks[j] != r) {
264: PetscFindMPIInt(r,nranks,sranks,&j);
265: }
266: if (j < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unable to locate neighbor rank %D",r);
267: for (v=0;v<nv;v++) {
268: viremote[v*nl + i].rank = r;
269: viremote[v*nl + i].index = v*ldrs[j] + ii;
270: if (ilocal) vilocal[v*nl + i] = v*ldl + ilocal[i];
271: }
272: }
273: PetscFree(sranks);
274: PetscFree(ldrs);
275: PetscSFCreate(comm,vsf);
276: PetscSFSetGraph(*vsf,vnr,vnl,vilocal,PETSC_OWN_POINTER,viremote,PETSC_OWN_POINTER);
277: return(0);
278: }
280: static PetscErrorCode VecSign(Vec v, Vec s)
281: {
282: const PetscScalar *av;
283: PetscScalar *as;
284: PetscInt i,n;
285: PetscErrorCode ierr;
290: VecGetArrayRead(v,&av);
291: VecGetArrayWrite(s,&as);
292: VecGetLocalSize(s,&n);
293: VecGetLocalSize(v,&i);
294: if (i != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Invalid local sizes %D != %D",i,n);
295: for (i=0;i<n;i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
296: VecRestoreArrayWrite(s,&as);
297: VecRestoreArrayRead(v,&av);
298: return(0);
299: }
301: /* these are approximate norms */
302: /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
303: NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
304: static PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal* n)
305: {
306: Vec x,y,w,z;
307: PetscReal normz,adot;
308: PetscScalar dot;
309: PetscInt i,j,N,jold = -1;
313: switch (normtype) {
314: case NORM_INFINITY:
315: case NORM_1:
316: if (normsamples < 0) normsamples = 10; /* pure guess */
317: if (normtype == NORM_INFINITY) {
318: Mat B;
319: MatCreateTranspose(A,&B);
320: A = B;
321: } else {
322: PetscObjectReference((PetscObject)A);
323: }
324: MatCreateVecs(A,&x,&y);
325: MatCreateVecs(A,&z,&w);
326: VecGetSize(x,&N);
327: VecSet(x,1./N);
328: VecSetOption(x,VEC_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE);
329: *n = 0.0;
330: for (i = 0; i < normsamples; i++) {
331: MatMult(A,x,y);
332: VecSign(y,w);
333: MatMultTranspose(A,w,z);
334: VecNorm(z,NORM_INFINITY,&normz);
335: VecDot(x,z,&dot);
336: adot = PetscAbsScalar(dot);
337: PetscInfo4(A,"%s norm it %D -> (%g %g)\n",NormTypes[normtype],i,(double)normz,(double)adot);
338: if (normz <= adot && i > 0) {
339: VecNorm(y,NORM_1,n);
340: break;
341: }
342: VecSet(x,0.);
343: VecMax(z,&j,&normz);
344: if (j == jold) {
345: VecNorm(y,NORM_1,n);
346: PetscInfo2(A,"%s norm it %D -> breakdown (j==jold)\n",NormTypes[normtype],i);
347: break;
348: }
349: jold = j;
350: VecSetValue(x,j,1.0,INSERT_VALUES);
351: VecAssemblyBegin(x);
352: VecAssemblyEnd(x);
353: }
354: MatDestroy(&A);
355: VecDestroy(&x);
356: VecDestroy(&w);
357: VecDestroy(&y);
358: VecDestroy(&z);
359: break;
360: case NORM_2:
361: if (normsamples < 0) normsamples = 20; /* pure guess */
362: MatCreateVecs(A,&x,&y);
363: MatCreateVecs(A,&z,NULL);
364: VecSetRandom(x,NULL);
365: VecNormalize(x,NULL);
366: *n = 0.0;
367: for (i = 0; i < normsamples; i++) {
368: MatMult(A,x,y);
369: VecNormalize(y,n);
370: MatMultTranspose(A,y,z);
371: VecNorm(z,NORM_2,&normz);
372: VecDot(x,z,&dot);
373: adot = PetscAbsScalar(dot);
374: PetscInfo5(A,"%s norm it %D -> %g (%g %g)\n",NormTypes[normtype],i,(double)*n,(double)normz,(double)adot);
375: if (normz <= adot) break;
376: if (i < normsamples - 1) {
377: Vec t;
379: VecNormalize(z,NULL);
380: t = x;
381: x = z;
382: z = t;
383: }
384: }
385: VecDestroy(&x);
386: VecDestroy(&y);
387: VecDestroy(&z);
388: break;
389: default:
390: SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"%s norm not supported",NormTypes[normtype]);
391: }
392: PetscInfo3(A,"%s norm %g computed in %D iterations\n",NormTypes[normtype],(double)*n,i);
393: return(0);
394: }
396: PETSC_EXTERN PetscErrorCode MatNorm_HARA(Mat A, NormType normtype, PetscReal* n)
397: {
399: PetscBool ishara;
400: PetscInt nmax = PETSC_DECIDE;
403: PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);
404: if (ishara) {
405: Mat_HARA *a = (Mat_HARA*)A->data;
407: nmax = a->norm_max_samples;
408: }
409: MatApproximateNorm_Private(A,normtype,nmax,n);
410: return(0);
411: }
413: static PetscErrorCode MatMultNKernel_HARA(Mat A, PetscBool transA, Mat B, Mat C)
414: {
415: Mat_HARA *hara = (Mat_HARA*)A->data;
416: haraHandle_t handle = hara->handle->handle;
417: PetscBool boundtocpu = PETSC_TRUE;
418: PetscScalar *xx,*yy,*uxx,*uyy;
419: PetscInt blda,clda;
420: PetscSF bsf,csf;
424: #if defined(HARA_USE_GPU)
425: boundtocpu = A->boundtocpu;
426: #endif
427: MatDenseGetLDA(B,&blda);
428: MatDenseGetLDA(C,&clda);
429: if (hara->sf) {
430: PetscInt n;
432: PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
433: PetscObjectQuery((PetscObject)B,"_mathara_vectorsf",(PetscObject*)&bsf);
434: if (!bsf) {
435: PetscSFGetVectorSF(hara->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);
436: PetscObjectCompose((PetscObject)B,"_mathara_vectorsf",(PetscObject)bsf);
437: PetscObjectDereference((PetscObject)bsf);
438: }
439: PetscObjectQuery((PetscObject)C,"_mathara_vectorsf",(PetscObject*)&csf);
440: if (!csf) {
441: PetscSFGetVectorSF(hara->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);
442: PetscObjectCompose((PetscObject)C,"_mathara_vectorsf",(PetscObject)csf);
443: PetscObjectDereference((PetscObject)csf);
444: }
445: blda = n;
446: clda = n;
447: }
448: if (!boundtocpu) {
449: #if defined(HARA_USE_GPU)
450: PetscBool ciscuda,biscuda;
452: if (hara->sf) {
453: PetscInt n;
455: PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
456: if (hara->xxs_gpu < B->cmap->n) { hara->xx_gpu->resize(n*B->cmap->N); hara->xxs_gpu = B->cmap->N; }
457: if (hara->yys_gpu < B->cmap->n) { hara->yy_gpu->resize(n*B->cmap->N); hara->yys_gpu = B->cmap->N; }
458: }
459: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
460: PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
461: if (!biscuda) {
462: MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);
463: }
464: PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
465: if (!ciscuda) {
466: C->assembled = PETSC_TRUE;
467: MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);
468: }
469: MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);
470: MatDenseCUDAGetArrayWrite(C,&yy);
471: if (hara->sf) {
472: uxx = MatHaraGetThrustPointer(*hara->xx_gpu);
473: uyy = MatHaraGetThrustPointer(*hara->yy_gpu);
474: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
475: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
476: } else {
477: uxx = xx;
478: uyy = yy;
479: }
480: if (hara->dist_hmatrix_gpu) {
481: if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
482: distributed_hgemv(/* transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
483: } else {
484: hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
485: }
486: MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);
487: if (hara->sf) {
488: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
489: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
490: }
491: MatDenseCUDARestoreArrayWrite(C,&yy);
492: if (!biscuda) {
493: MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);
494: }
495: if (!ciscuda) {
496: MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);
497: }
498: #endif
499: } else {
500: if (hara->sf) {
501: PetscInt n;
503: PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
504: if (hara->xxs < B->cmap->n) { hara->xx->resize(n*B->cmap->N); hara->xxs = B->cmap->N; }
505: if (hara->yys < B->cmap->n) { hara->yy->resize(n*B->cmap->N); hara->yys = B->cmap->N; }
506: }
507: MatDenseGetArrayRead(B,(const PetscScalar**)&xx);
508: MatDenseGetArrayWrite(C,&yy);
509: if (hara->sf) {
510: uxx = MatHaraGetThrustPointer(*hara->xx);
511: uyy = MatHaraGetThrustPointer(*hara->yy);
512: PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
513: PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
514: } else {
515: uxx = xx;
516: uyy = yy;
517: }
518: if (hara->dist_hmatrix) {
519: if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
520: distributed_hgemv(/*transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
521: } else {
522: hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
523: }
524: MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);
525: if (hara->sf) {
526: PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
527: PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
528: }
529: MatDenseRestoreArrayWrite(C,&yy);
530: }
531: return(0);
532: }
534: static PetscErrorCode MatProductNumeric_HARA(Mat C)
535: {
536: Mat_Product *product = C->product;
540: MatCheckProduct(C,1);
541: switch (product->type) {
542: case MATPRODUCT_AB:
543: MatMultNKernel_HARA(product->A,PETSC_FALSE,product->B,C);
544: break;
545: case MATPRODUCT_AtB:
546: MatMultNKernel_HARA(product->A,PETSC_TRUE,product->B,C);
547: break;
548: default:
549: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
550: }
551: return(0);
552: }
554: static PetscErrorCode MatProductSymbolic_HARA(Mat C)
555: {
557: Mat_Product *product = C->product;
558: PetscBool cisdense;
559: Mat A,B;
562: MatCheckProduct(C,1);
563: A = product->A;
564: B = product->B;
565: switch (product->type) {
566: case MATPRODUCT_AB:
567: MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);
568: MatSetBlockSizesFromMats(C,product->A,product->B);
569: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
570: if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
571: MatSetUp(C);
572: break;
573: case MATPRODUCT_AtB:
574: MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);
575: MatSetBlockSizesFromMats(C,product->A,product->B);
576: PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");
577: if (!cisdense) { MatSetType(C,((PetscObject)product->B)->type_name); }
578: MatSetUp(C);
579: break;
580: default:
581: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
582: }
583: C->ops->productsymbolic = NULL;
584: C->ops->productnumeric = MatProductNumeric_HARA;
585: return(0);
586: }
588: static PetscErrorCode MatProductSetFromOptions_HARA(Mat C)
589: {
591: MatCheckProduct(C,1);
592: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
593: C->ops->productsymbolic = MatProductSymbolic_HARA;
594: }
595: return(0);
596: }
598: static PetscErrorCode MatMultKernel_HARA(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
599: {
600: Mat_HARA *hara = (Mat_HARA*)A->data;
601: haraHandle_t handle = hara->handle->handle;
602: PetscBool boundtocpu = PETSC_TRUE;
603: PetscInt n;
604: PetscScalar *xx,*yy,*uxx,*uyy;
608: #if defined(HARA_USE_GPU)
609: boundtocpu = A->boundtocpu;
610: #endif
611: if (hara->sf) {
612: PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);
613: } else n = A->rmap->n;
614: if (!boundtocpu) {
615: #if defined(HARA_USE_GPU)
616: VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);
617: if (sy == 0.0) {
618: VecCUDAGetArrayWrite(y,&yy);
619: } else {
620: VecCUDAGetArray(y,&yy);
621: }
622: if (hara->sf) {
623: uxx = MatHaraGetThrustPointer(*hara->xx_gpu);
624: uyy = MatHaraGetThrustPointer(*hara->yy_gpu);
626: PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
627: PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
628: if (sy != 0.0) {
629: PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
630: PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
631: }
632: } else {
633: uxx = xx;
634: uyy = yy;
635: }
636: if (hara->dist_hmatrix_gpu) {
637: if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
638: distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, hara->handle);
639: } else {
640: hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
641: }
642: VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);
643: if (hara->sf) {
644: PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
645: PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
646: }
647: if (sy == 0.0) {
648: VecCUDARestoreArrayWrite(y,&yy);
649: } else {
650: VecCUDARestoreArray(y,&yy);
651: }
652: #endif
653: } else {
654: VecGetArrayRead(x,(const PetscScalar**)&xx);
655: if (sy == 0.0) {
656: VecGetArrayWrite(y,&yy);
657: } else {
658: VecGetArray(y,&yy);
659: }
660: if (hara->sf) {
661: uxx = MatHaraGetThrustPointer(*hara->xx);
662: uyy = MatHaraGetThrustPointer(*hara->yy);
664: PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
665: PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE);
666: if (sy != 0.0) {
667: PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
668: PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE);
669: }
670: } else {
671: uxx = xx;
672: uyy = yy;
673: }
674: if (hara->dist_hmatrix) {
675: if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
676: distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, n, sy, uyy, n, 1, hara->handle);
677: } else {
678: hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, n, sy, uyy, n, 1, handle);
679: }
680: VecRestoreArrayRead(x,(const PetscScalar**)&xx);
681: if (hara->sf) {
682: PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
683: PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE);
684: }
685: if (sy == 0.0) {
686: VecRestoreArrayWrite(y,&yy);
687: } else {
688: VecRestoreArray(y,&yy);
689: }
690: }
691: return(0);
692: }
694: static PetscErrorCode MatMultTranspose_HARA(Mat A, Vec x, Vec y)
695: {
699: MatMultKernel_HARA(A,x,0.0,y,PETSC_TRUE);
700: return(0);
701: }
703: static PetscErrorCode MatMult_HARA(Mat A, Vec x, Vec y)
704: {
708: MatMultKernel_HARA(A,x,0.0,y,PETSC_FALSE);
709: return(0);
710: }
712: static PetscErrorCode MatMultTransposeAdd_HARA(Mat A, Vec x, Vec y, Vec z)
713: {
717: VecCopy(y,z);
718: MatMultKernel_HARA(A,x,1.0,z,PETSC_TRUE);
719: return(0);
720: }
722: static PetscErrorCode MatMultAdd_HARA(Mat A, Vec x, Vec y, Vec z)
723: {
727: VecCopy(y,z);
728: MatMultKernel_HARA(A,x,1.0,z,PETSC_FALSE);
729: return(0);
730: }
732: static PetscErrorCode MatScale_HARA(Mat A, PetscScalar s)
733: {
734: Mat_HARA *a = (Mat_HARA*)A->data;
737: a->s *= s;
738: return(0);
739: }
741: static PetscErrorCode MatSetFromOptions_HARA(PetscOptionItems *PetscOptionsObject,Mat A)
742: {
743: Mat_HARA *a = (Mat_HARA*)A->data;
747: PetscOptionsHead(PetscOptionsObject,"HARA options");
748: PetscOptionsInt("-mat_hara_leafsize","Leaf size when constructed from kernel",NULL,a->leafsize,&a->leafsize,NULL);
749: PetscOptionsReal("-mat_hara_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);
750: PetscOptionsInt("-mat_hara_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);
751: PetscOptionsInt("-mat_hara_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);
752: PetscOptionsInt("-mat_hara_samples","Number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);
753: PetscOptionsReal("-mat_hara_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);
754: PetscOptionsBool("-mat_hara_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);
755: PetscOptionsTail();
756: return(0);
757: }
759: static PetscErrorCode MatHaraSetCoords_HARA(Mat,PetscInt,const PetscReal[],MatHaraKernel,void*);
761: static PetscErrorCode MatHaraInferCoordinates_Private(Mat A)
762: {
763: Mat_HARA *a = (Mat_HARA*)A->data;
764: Vec c;
765: PetscInt spacedim;
766: const PetscScalar *coords;
767: PetscErrorCode ierr;
770: if (a->spacedim) return(0);
771: PetscObjectQuery((PetscObject)A,"__mathara_coords",(PetscObject*)&c);
772: if (!c && a->sampler) {
773: Mat S = a->sampler->GetSamplingMat();
775: PetscObjectQuery((PetscObject)S,"__mathara_coords",(PetscObject*)&c);
776: if (!c) {
777: PetscBool ishara;
779: PetscObjectTypeCompare((PetscObject)S,MATHARA,&ishara);
780: if (ishara) {
781: Mat_HARA *s = (Mat_HARA*)S->data;
783: a->spacedim = s->spacedim;
784: if (s->ptcloud1) {
785: a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*s->ptcloud1);
786: } else if (s->ptcloud2) {
787: a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*s->ptcloud2);
788: } else if (s->ptcloud3) {
789: a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*s->ptcloud3);
790: }
791: }
792: }
793: }
794: if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
795: VecGetArrayRead(c,&coords);
796: VecGetBlockSize(c,&spacedim);
797: MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);
798: VecRestoreArrayRead(c,&coords);
799: return(0);
800: }
802: static PetscErrorCode MatSetUpMultiply_HARA(Mat A)
803: {
804: MPI_Comm comm;
805: PetscMPIInt size;
807: Mat_HARA *a = (Mat_HARA*)A->data;
808: IS is;
809: PetscInt n,*idx;
810: int *iidx;
811: PetscCopyMode own;
812: PetscBool rid;
815: if (a->multsetup) return(0);
816: PetscObjectGetComm((PetscObject)A,&comm);
817: MPI_Comm_size(comm,&size);
818: if (size > 1) {
819: iidx = MatHaraGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
820: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
821: } else {
822: n = a->hmatrix->u_basis_tree.index_map.size();
823: iidx = MatHaraGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
824: }
825: if (PetscDefined(USE_64BIT_INDICES)) {
826: PetscInt i;
828: own = PETSC_OWN_POINTER;
829: PetscMalloc1(n,&idx);
830: for (i=0;i<n;i++) idx[i] = iidx[i];
831: } else {
832: own = PETSC_USE_POINTER;
833: idx = iidx;
834: }
835: ISCreateGeneral(comm,n,idx,own,&is);
836: ISIdentity(is,&rid);
837: if (!rid) {
838: PetscSFCreate(comm,&a->sf);
839: PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);
840: #if defined(HARA_USE_GPU)
841: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
842: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
843: a->xxs_gpu = 1;
844: a->yys_gpu = 1;
845: #endif
846: a->xx = new thrust::host_vector<PetscScalar>(n);
847: a->yy = new thrust::host_vector<PetscScalar>(n);
848: a->xxs = 1;
849: a->yys = 1;
850: }
851: ISDestroy(&is);
852: a->multsetup = PETSC_TRUE;
853: return(0);
854: }
856: static PetscErrorCode MatAssemblyEnd_HARA(Mat A, MatAssemblyType asstype)
857: {
858: Mat_HARA *a = (Mat_HARA*)A->data;
859: haraHandle_t handle = a->handle->handle;
860: PetscBool kernel = PETSC_FALSE;
861: PetscBool boundtocpu = PETSC_TRUE;
862: MPI_Comm comm;
863: PetscMPIInt size;
867: PetscObjectGetComm((PetscObject)A,&comm);
868: MPI_Comm_size(comm,&size);
869: /* TODO REUSABILITY of geometric construction */
870: delete a->hmatrix;
871: delete a->dist_hmatrix;
872: #if defined(HARA_USE_GPU)
873: delete a->hmatrix_gpu;
874: delete a->dist_hmatrix_gpu;
875: #endif
876: /* TODO: other? */
877: BoxCenterAdmissibility<Hara_Real,1> adm1(a->eta,a->leafsize);
878: BoxCenterAdmissibility<Hara_Real,2> adm2(a->eta,a->leafsize);
879: BoxCenterAdmissibility<Hara_Real,3> adm3(a->eta,a->leafsize);
880: if (size > 1) {
881: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
882: } else {
883: a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
884: }
885: MatHaraInferCoordinates_Private(A);
886: switch (a->spacedim) {
887: case 1:
888: if (!a->ptcloud1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
889: if (a->kernel1) {
890: kernel = PETSC_TRUE;
891: if (size > 1) {
892: buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
893: } else {
894: buildHMatrix(*a->hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord,a->basisord);
895: }
896: } else {
897: if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
898: buildHMatrixStructure(*a->hmatrix,*a->ptcloud1,adm1,a->leafsize,0,0);
899: }
900: break;
901: case 2:
902: if (!a->ptcloud2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
903: if (a->kernel2) {
904: kernel = PETSC_TRUE;
905: if (size > 1) {
906: buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
907: } else {
908: buildHMatrix(*a->hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord,a->basisord);
909: }
910: } else {
911: if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
912: buildHMatrixStructure(*a->hmatrix,*a->ptcloud2,adm2,a->leafsize,0,0);
913: }
914: break;
915: case 3:
916: if (!a->ptcloud3) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
917: if (a->kernel3) {
918: kernel = PETSC_TRUE;
919: if (size > 1) {
920: buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
921: } else {
922: buildHMatrix(*a->hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord,a->basisord);
923: }
924: } else {
925: if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
926: buildHMatrixStructure(*a->hmatrix,*a->ptcloud3,adm3,a->leafsize,0,0);
927: }
928: break;
929: default:
930: SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",a->spacedim);
931: }
933: MatSetUpMultiply_HARA(A);
935: #if defined(HARA_USE_GPU)
936: boundtocpu = A->boundtocpu;
937: if (!boundtocpu) {
938: if (size > 1) {
939: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
940: } else {
941: a->hmatrix_gpu = new GPU_HMatrix(*a->hmatrix);
942: }
943: }
944: #endif
945: if (size == 1) {
946: if (!kernel && a->sampler) {
947: PetscReal Anorm;
948: bool verbose = false;
950: MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,PETSC_DECIDE,&Anorm);
951: if (boundtocpu) {
952: a->sampler->SetGPUSampling(false);
953: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
954: #if defined(HARA_USE_GPU)
955: } else {
956: a->sampler->SetGPUSampling(true);
957: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
958: #endif
959: }
960: }
961: }
962: #if defined(HARA_USE_GPU)
963: if (kernel) A->offloadmask = PETSC_OFFLOAD_BOTH;
964: else A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
965: #endif
967: if (!a->s) a->s = 1.0;
968: A->assembled = PETSC_TRUE;
970: if (a->sampler) {
971: PetscBool check = a->check_construction;
973: PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_hara_check",&check,NULL);
974: if (check) {
975: Mat E,Ae;
976: PetscReal n1,ni,n2;
977: PetscReal n1A,niA,n2A;
978: void (*normfunc)(void);
980: Ae = a->sampler->GetSamplingMat();
981: MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);
982: MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_HARA);
983: MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);
984: MatNorm(E,NORM_1,&n1);
985: MatNorm(E,NORM_INFINITY,&ni);
986: MatNorm(E,NORM_2,&n2);
988: MatGetOperation(Ae,MATOP_NORM,&normfunc);
989: MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_HARA);
990: MatNorm(Ae,NORM_1,&n1A);
991: MatNorm(Ae,NORM_INFINITY,&niA);
992: MatNorm(Ae,NORM_2,&n2A);
993: n1A = PetscMax(n1A,PETSC_SMALL);
994: n2A = PetscMax(n2A,PETSC_SMALL);
995: niA = PetscMax(niA,PETSC_SMALL);
996: MatSetOperation(Ae,MATOP_NORM,normfunc);
997: PetscPrintf(PetscObjectComm((PetscObject)A),"MATHARA construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
998: MatDestroy(&E);
999: }
1000: }
1001: return(0);
1002: }
1004: static PetscErrorCode MatZeroEntries_HARA(Mat A)
1005: {
1007: PetscMPIInt size;
1008: Mat_HARA *a = (Mat_HARA*)A->data;
1011: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1012: if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1013: else {
1014: a->hmatrix->clearData();
1015: #if defined(HARA_USE_GPU)
1016: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017: #endif
1018: }
1019: return(0);
1020: }
1022: static PetscErrorCode MatDuplicate_HARA(Mat B, MatDuplicateOption op, Mat *nA)
1023: {
1024: Mat A;
1025: Mat_HARA *a, *b = (Mat_HARA*)B->data;
1026: #if defined(PETSC_HAVE_CUDA)
1027: PetscBool iscpu = PETSC_FALSE;
1028: #else
1029: PetscBool iscpu = PETSC_TRUE;
1030: #endif
1032: MPI_Comm comm;
1035: PetscObjectGetComm((PetscObject)B,&comm);
1036: MatCreate(comm,&A);
1037: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1038: MatSetType(A,MATHARA);
1039: MatPropagateSymmetryOptions(B,A);
1041: a = (Mat_HARA*)A->data;
1042: a->s = b->s;
1043: a->spacedim = b->spacedim;
1044: if (b->ptcloud1) {
1045: a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*b->ptcloud1);
1046: } else if (b->ptcloud2) {
1047: a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*b->ptcloud2);
1048: } else if (b->ptcloud3) {
1049: a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*b->ptcloud3);
1050: }
1051: if (op == MAT_COPY_VALUES) {
1052: if (b->kernel1) {
1053: a->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(*b->kernel1);
1054: } else if (b->kernel2) {
1055: a->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(*b->kernel2);
1056: } else if (b->kernel3) {
1057: a->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(*b->kernel3);
1058: }
1059: }
1061: if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1062: #if defined(HARA_USE_GPU)
1063: if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1064: #endif
1065: if (b->hmatrix) {
1066: a->hmatrix = new HMatrix(*b->hmatrix);
1067: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1068: }
1069: #if defined(HARA_USE_GPU)
1070: if (b->hmatrix_gpu) {
1071: a->hmatrix_gpu = new GPU_HMatrix(*b->hmatrix_gpu);
1072: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1073: }
1074: #endif
1076: MatSetUp(A);
1077: MatSetUpMultiply_HARA(A);
1078: if (op == MAT_COPY_VALUES) {
1079: A->assembled = PETSC_TRUE;
1080: #if defined(PETSC_HAVE_CUDA)
1081: iscpu = B->boundtocpu;
1082: #endif
1083: }
1084: MatBindToCPU(A,iscpu);
1086: *nA = A;
1087: return(0);
1088: }
1090: static PetscErrorCode MatView_HARA(Mat A, PetscViewer view)
1091: {
1092: Mat_HARA *hara = (Mat_HARA*)A->data;
1093: PetscBool isascii;
1094: PetscErrorCode ierr;
1095: PetscMPIInt size;
1096: PetscViewerFormat format;
1099: PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);
1100: PetscViewerGetFormat(view,&format);
1101: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1102: if (isascii) {
1103: PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",hara->sampler ? "Mat" : (hara->spacedim ? "Kernel" : "None"));
1104: PetscViewerASCIIPrintf(view," PointCloud dim %D\n",hara->spacedim);
1105: PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %D, eta %g\n",hara->leafsize,(double)hara->eta);
1106: if (hara->sampler) {
1107: PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %D, samples %D, tolerance %g\n",hara->max_rank,hara->bs,(double)hara->rtol);
1108: } else {
1109: PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %D\n",hara->basisord);
1110: }
1111: if (size == 1) {
1112: double dense_mem_cpu = hara->hmatrix ? hara->hmatrix->getDenseMemoryUsage() : 0;
1113: double low_rank_cpu = hara->hmatrix ? hara->hmatrix->getLowRankMemoryUsage() : 0;
1114: #if defined(HARA_USE_GPU)
1115: double dense_mem_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getDenseMemoryUsage() : 0;
1116: double low_rank_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1117: #endif
1118: PetscViewerASCIIPrintf(view," Memory consumption (CPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);
1119: #if defined(HARA_USE_GPU)
1120: PetscViewerASCIIPrintf(view," Memory consumption (GPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);
1121: #endif
1122: }
1123: }
1124: return(0);
1125: }
1127: static PetscErrorCode MatHaraSetCoords_HARA(Mat A, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel, void *kernelctx)
1128: {
1129: Mat_HARA *hara = (Mat_HARA*)A->data;
1130: PetscReal *gcoords;
1131: PetscInt N;
1132: MPI_Comm comm;
1133: PetscMPIInt size;
1134: PetscBool cong;
1138: PetscLayoutSetUp(A->rmap);
1139: PetscLayoutSetUp(A->cmap);
1140: PetscObjectGetComm((PetscObject)A,&comm);
1141: MatHasCongruentLayouts(A,&cong);
1142: if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1143: N = A->rmap->N;
1144: MPI_Comm_size(comm,&size);
1145: if (size > 1) {
1146: PetscSF sf;
1147: MPI_Datatype dtype;
1149: MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);
1150: MPI_Type_commit(&dtype);
1152: PetscSFCreate(comm,&sf);
1153: PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);
1154: PetscMalloc1(spacedim*N,&gcoords);
1155: PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE);
1156: PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE);
1157: PetscSFDestroy(&sf);
1158: MPI_Type_free(&dtype);
1159: } else gcoords = (PetscReal*)coords;
1161: delete hara->ptcloud1;
1162: delete hara->ptcloud2;
1163: delete hara->ptcloud3;
1164: delete hara->kernel1;
1165: delete hara->kernel2;
1166: delete hara->kernel3;
1167: hara->spacedim = spacedim;
1168: switch (spacedim) {
1169: case 1:
1170: hara->ptcloud1 = new PetscPointCloud1D<PetscReal>(N,gcoords);
1171: if (kernel) hara->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(kernel,kernelctx);
1172: break;
1173: case 2:
1174: hara->ptcloud2 = new PetscPointCloud2D<PetscReal>(N,gcoords);
1175: if (kernel) hara->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(kernel,kernelctx);
1176: break;
1177: case 3:
1178: hara->ptcloud3 = new PetscPointCloud3D<PetscReal>(N,gcoords);
1179: if (kernel) hara->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(kernel,kernelctx);
1180: break;
1181: default:
1182: SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",hara->spacedim);
1183: }
1184: if (gcoords != coords) { PetscFree(gcoords); }
1185: A->preallocated = PETSC_TRUE;
1186: return(0);
1187: }
1189: PETSC_EXTERN PetscErrorCode MatCreate_HARA(Mat A)
1190: {
1191: Mat_HARA *a;
1193: PetscMPIInt size;
1196: PetscNewLog(A,&a);
1197: A->data = (void*)a;
1199: a->eta = 0.9;
1200: a->leafsize = 32;
1201: a->basisord = 4;
1202: a->max_rank = 64;
1203: a->bs = 32;
1204: a->rtol = 1.e-4;
1205: a->s = 1.0;
1206: a->norm_max_samples = 10;
1207: haraCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1209: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1210: PetscObjectChangeTypeName((PetscObject)A,MATHARA);
1211: PetscMemzero(A->ops,sizeof(struct _MatOps));
1213: A->ops->destroy = MatDestroy_HARA;
1214: A->ops->view = MatView_HARA;
1215: A->ops->assemblyend = MatAssemblyEnd_HARA;
1216: A->ops->mult = MatMult_HARA;
1217: A->ops->multtranspose = MatMultTranspose_HARA;
1218: A->ops->multadd = MatMultAdd_HARA;
1219: A->ops->multtransposeadd = MatMultTransposeAdd_HARA;
1220: A->ops->scale = MatScale_HARA;
1221: A->ops->duplicate = MatDuplicate_HARA;
1222: A->ops->setfromoptions = MatSetFromOptions_HARA;
1223: A->ops->norm = MatNorm_HARA;
1224: A->ops->zeroentries = MatZeroEntries_HARA;
1226: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",MatProductSetFromOptions_HARA);
1227: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",MatProductSetFromOptions_HARA);
1228: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",MatProductSetFromOptions_HARA);
1229: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",MatProductSetFromOptions_HARA);
1230: #if defined(PETSC_HAVE_CUDA)
1231: PetscFree(A->defaultvectype);
1232: PetscStrallocpy(VECCUDA,&A->defaultvectype);
1233: #endif
1234: return(0);
1235: }
1237: PetscErrorCode MatHaraSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1238: {
1239: PetscBool ishara;
1246: PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);
1247: if (ishara) {
1248: Mat_HARA *a = (Mat_HARA*)A->data;
1251: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1252: a->sampler->SetSamplingMat(B);
1253: if (bs > 0) a->bs = bs;
1254: if (tol > 0.) a->rtol = tol;
1255: delete a->kernel1;
1256: delete a->kernel2;
1257: delete a->kernel3;
1258: }
1259: return(0);
1260: }
1262: PetscErrorCode MatCreateHaraFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel,void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1263: {
1264: Mat A;
1265: Mat_HARA *hara;
1266: #if defined(PETSC_HAVE_CUDA)
1267: PetscBool iscpu = PETSC_FALSE;
1268: #else
1269: PetscBool iscpu = PETSC_TRUE;
1270: #endif
1274: MatCreate(comm,&A);
1275: MatSetSizes(A,m,n,M,N);
1276: MatSetType(A,MATHARA);
1277: MatBindToCPU(A,iscpu);
1278: MatHaraSetCoords_HARA(A,spacedim,coords,kernel,kernelctx);
1280: hara = (Mat_HARA*)A->data;
1281: if (eta > 0.) hara->eta = eta;
1282: if (leafsize > 0) hara->leafsize = leafsize;
1283: if (basisord > 0) hara->basisord = basisord;
1285: *nA = A;
1286: return(0);
1287: }
1289: PetscErrorCode MatCreateHaraFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1290: {
1291: Mat A;
1292: Mat_HARA *hara;
1293: MPI_Comm comm;
1294: #if defined(PETSC_HAVE_CUDA)
1295: PetscBool iscpu = PETSC_FALSE;
1296: #else
1297: PetscBool iscpu = PETSC_TRUE;
1298: #endif
1305: PetscObjectGetComm((PetscObject)B,&comm);
1306: MatCreate(comm,&A);
1307: MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);
1308: MatSetType(A,MATHARA);
1309: MatBindToCPU(A,iscpu);
1310: if (spacedim) {
1311: MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);
1312: }
1313: MatPropagateSymmetryOptions(B,A);
1314: /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1316: hara = (Mat_HARA*)A->data;
1317: hara->sampler = new PetscMatrixSampler(B);
1318: if (eta > 0.) hara->eta = eta;
1319: if (leafsize > 0) hara->leafsize = leafsize;
1320: if (maxrank > 0) hara->max_rank = maxrank;
1321: if (bs > 0) hara->bs = bs;
1322: if (rtol > 0.) hara->rtol = rtol;
1324: *nA = A;
1325: A->preallocated = PETSC_TRUE;
1326: return(0);
1327: }