Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_CXX_COMPLEX_FIX
3: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
5: #include <petscconf.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9: #include <thrust/advance.h>
10: #include <petscsf.h>
12: struct VecCUDAEquals
13: {
14: template <typename Tuple>
15: __host__ __device__
16: void operator()(Tuple t)
17: {
18: thrust::get<1>(t) = thrust::get<0>(t);
19: }
20: };
22: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
23: {
24: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
25: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
26: PetscInt n = cusp->coo_nd + cusp->coo_no;
27: PetscErrorCode ierr;
28: cudaError_t cerr;
31: if (cusp->coo_p && v) {
32: thrust::device_ptr<const PetscScalar> d_v;
33: THRUSTARRAY *w = NULL;
35: if (isCudaMem(v)) {
36: d_v = thrust::device_pointer_cast(v);
37: } else {
38: w = new THRUSTARRAY(n);
39: w->assign(v,v+n);
40: PetscLogCpuToGpu(n*sizeof(PetscScalar));
41: d_v = w->data();
42: }
44: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
45: cusp->coo_pw->begin()));
46: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
47: cusp->coo_pw->end()));
48: PetscLogGpuTimeBegin();
49: thrust::for_each(zibit,zieit,VecCUDAEquals());
50: cerr = WaitForCUDA();CHKERRCUDA(cerr);
51: PetscLogGpuTimeEnd();
52: delete w;
53: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);
54: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
55: } else {
56: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);
57: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);
58: }
59: PetscObjectStateIncrease((PetscObject)A);
60: A->num_ass++;
61: A->assembled = PETSC_TRUE;
62: A->ass_nonzerostate = A->nonzerostate;
63: A->offloadmask = PETSC_OFFLOAD_GPU;
64: return(0);
65: }
67: template <typename Tuple>
68: struct IsNotOffDiagT
69: {
70: PetscInt _cstart,_cend;
72: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
73: __host__ __device__
74: inline bool operator()(Tuple t)
75: {
76: return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
77: }
78: };
80: struct IsOffDiag
81: {
82: PetscInt _cstart,_cend;
84: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
85: __host__ __device__
86: inline bool operator() (const PetscInt &c)
87: {
88: return c < _cstart || c >= _cend;
89: }
90: };
92: struct GlobToLoc
93: {
94: PetscInt _start;
96: GlobToLoc(PetscInt start) : _start(start) {}
97: __host__ __device__
98: inline PetscInt operator() (const PetscInt &c)
99: {
100: return c - _start;
101: }
102: };
104: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
105: {
106: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
107: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
108: PetscErrorCode ierr;
109: PetscInt *jj;
110: size_t noff = 0;
111: THRUSTINTARRAY d_i(n);
112: THRUSTINTARRAY d_j(n);
113: ISLocalToGlobalMapping l2g;
114: cudaError_t cerr;
117: PetscLayoutSetUp(B->rmap);
118: PetscLayoutSetUp(B->cmap);
119: if (b->A) { MatCUSPARSEClearHandle(b->A); }
120: if (b->B) { MatCUSPARSEClearHandle(b->B); }
121: PetscFree(b->garray);
122: VecDestroy(&b->lvec);
123: MatDestroy(&b->A);
124: MatDestroy(&b->B);
126: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
127: d_i.assign(coo_i,coo_i+n);
128: d_j.assign(coo_j,coo_j+n);
129: delete cusp->coo_p;
130: delete cusp->coo_pw;
131: cusp->coo_p = NULL;
132: cusp->coo_pw = NULL;
133: PetscLogGpuTimeBegin();
134: auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
135: auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
136: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
137: cusp->coo_p = new THRUSTINTARRAY(n);
138: cusp->coo_pw = new THRUSTARRAY(n);
139: thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
140: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
141: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
142: auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
143: firstoffd = mzipp.get_iterator_tuple().get<1>();
144: }
145: cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
146: cusp->coo_no = thrust::distance(firstoffd,d_j.end());
148: /* from global to local */
149: thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
150: thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
151: cerr = WaitForCUDA();CHKERRCUDA(cerr);
152: PetscLogGpuTimeEnd();
154: /* copy offdiag column indices to map on the CPU */
155: PetscMalloc1(cusp->coo_no,&jj);
156: cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
157: auto o_j = d_j.begin();
158: PetscLogGpuTimeBegin();
159: thrust::advance(o_j,cusp->coo_nd);
160: thrust::sort(thrust::device,o_j,d_j.end());
161: auto wit = thrust::unique(thrust::device,o_j,d_j.end());
162: cerr = WaitForCUDA();CHKERRCUDA(cerr);
163: PetscLogGpuTimeEnd();
164: noff = thrust::distance(o_j,wit);
165: PetscMalloc1(noff+1,&b->garray);
166: cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
167: PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
168: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
169: ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
170: ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);
171: if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D COO size",n,cusp->coo_no);
172: ISLocalToGlobalMappingDestroy(&l2g);
174: MatCreate(PETSC_COMM_SELF,&b->A);
175: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
176: MatSetType(b->A,MATSEQAIJCUSPARSE);
177: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
178: MatCreate(PETSC_COMM_SELF,&b->B);
179: MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
180: MatSetType(b->B,MATSEQAIJCUSPARSE);
181: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
183: /* GPU memory, cusparse specific call handles it internally */
184: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
185: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
186: PetscFree(jj);
188: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
189: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);
190: MatCUSPARSESetHandle(b->A,cusp->handle);
191: MatCUSPARSESetHandle(b->B,cusp->handle);
192: /*
193: MatCUSPARSESetStream(b->A,cusp->stream);
194: MatCUSPARSESetStream(b->B,cusp->stream);
195: */
196: MatSetUpMultiply_MPIAIJ(B);
197: B->preallocated = PETSC_TRUE;
198: B->nonzerostate++;
200: MatBindToCPU(b->A,B->boundtocpu);
201: MatBindToCPU(b->B,B->boundtocpu);
202: B->offloadmask = PETSC_OFFLOAD_CPU;
203: B->assembled = PETSC_FALSE;
204: B->was_assembled = PETSC_FALSE;
205: return(0);
206: }
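/*
  Usage sketch (illustrative only, not part of this file): the two routines above implement the
  MPIAIJCUSPARSE backend of the generic MatSetPreallocationCOO()/MatSetValuesCOO() interface.
  A caller provides the global (i,j) pairs once and may then upload new values repeatedly; if
  the value array already lives in device memory it is used directly (see isCudaMem() above).
  The communicator, sizes, and entries below are assumptions of the example.

    Mat         A;
    PetscInt    coo_i[] = {0,0,1};                // global row indices of the COO entries
    PetscInt    coo_j[] = {0,1,1};                // global column indices of the COO entries
    PetscScalar coo_v[] = {2.0,-1.0,2.0};         // values, one per (i,j) pair

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,N,N);
    MatSetType(A,MATMPIAIJCUSPARSE);
    MatSetPreallocationCOO(A,3,coo_i,coo_j);      // dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE()
    MatSetValuesCOO(A,coo_v,INSERT_VALUES);       // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE()
*/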
208: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
209: {
210: Mat Ad,Ao;
211: const PetscInt *cmap;
215: MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
216: MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
217: if (glob) {
218: PetscInt cst, i, dn, on, *gidx;
220: MatGetLocalSize(Ad,NULL,&dn);
221: MatGetLocalSize(Ao,NULL,&on);
222: MatGetOwnershipRangeColumn(A,&cst,NULL);
223: PetscMalloc1(dn+on,&gidx);
224: for (i=0; i<dn; i++) gidx[i] = cst + i;
225: for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
226: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
227: }
228: return(0);
229: }
231: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
232: {
233: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
234: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
235: PetscErrorCode ierr;
236: PetscInt i;
239: PetscLayoutSetUp(B->rmap);
240: PetscLayoutSetUp(B->cmap);
241: if (PetscDefined(USE_DEBUG) && d_nnz) {
242: for (i=0; i<B->rmap->n; i++) {
243: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
244: }
245: }
246: if (PetscDefined(USE_DEBUG) && o_nnz) {
247: for (i=0; i<B->rmap->n; i++) {
248: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
249: }
250: }
251: #if defined(PETSC_USE_CTABLE)
252: PetscTableDestroy(&b->colmap);
253: #else
254: PetscFree(b->colmap);
255: #endif
256: PetscFree(b->garray);
257: VecDestroy(&b->lvec);
258: VecScatterDestroy(&b->Mvctx);
259: /* Because the off-diagonal block B may have been resized we simply destroy it and create a new one each time */
260: MatDestroy(&b->B);
261: if (!b->A) {
262: MatCreate(PETSC_COMM_SELF,&b->A);
263: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
264: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
265: }
266: if (!b->B) {
267: PetscMPIInt size;
268: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
269: MatCreate(PETSC_COMM_SELF,&b->B);
270: MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
271: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
272: }
273: MatSetType(b->A,MATSEQAIJCUSPARSE);
274: MatSetType(b->B,MATSEQAIJCUSPARSE);
275: MatBindToCPU(b->A,B->boundtocpu);
276: MatBindToCPU(b->B,B->boundtocpu);
277: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
278: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
279: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
280: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
281: MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
282: MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
283: /* Let A, B use b's handle with pre-set stream
284: MatCUSPARSESetStream(b->A,cusparseStruct->stream);
285: MatCUSPARSESetStream(b->B,cusparseStruct->stream);
286: */
287: B->preallocated = PETSC_TRUE;
288: return(0);
289: }
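/*
  The multiplication routines below follow the standard MPIAIJ split: the scatter of the halo
  entries of xx into a->lvec is started first, the diagonal block a->A is applied while that
  communication is in flight, and the off-diagonal block a->B is applied to the received halo
  values afterwards (MatMultTranspose uses the same pieces with a reverse, adding scatter).
  Since both blocks are MATSEQAIJCUSPARSE, the local products run on the GPU unless the matrix
  has been bound to the CPU.
*/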
291: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
292: {
293: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
295: PetscInt nt;
298: VecGetLocalSize(xx,&nt);
299: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
300: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
301: (*a->A->ops->mult)(a->A,xx,yy);
302: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
303: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
304: return(0);
305: }
307: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
308: {
309: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
313: if (A->factortype == MAT_FACTOR_NONE) {
314: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
315: PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
316: if (d_mat) {
317: Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data;
318: Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data;
319: PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
320: cudaError_t err;
321: PetscScalar *vals;
322: PetscInfo(A,"Zero device matrix diag and offdiag\n");
323: err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
324: err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
325: err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
326: err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
327: }
328: }
329: MatZeroEntries(l->A);
330: MatZeroEntries(l->B);
332: return(0);
333: }
334: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
335: {
336: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
338: PetscInt nt;
341: VecGetLocalSize(xx,&nt);
342: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
343: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
344: (*a->A->ops->multadd)(a->A,xx,yy,zz);
345: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
346: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
347: return(0);
348: }
350: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
351: {
352: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
354: PetscInt nt;
357: VecGetLocalSize(xx,&nt);
358: if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
359: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
360: (*a->A->ops->multtranspose)(a->A,xx,yy);
361: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
362: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
363: return(0);
364: }
366: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
367: {
368: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
369: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
372: switch (op) {
373: case MAT_CUSPARSE_MULT_DIAG:
374: cusparseStruct->diagGPUMatFormat = format;
375: break;
376: case MAT_CUSPARSE_MULT_OFFDIAG:
377: cusparseStruct->offdiagGPUMatFormat = format;
378: break;
379: case MAT_CUSPARSE_ALL:
380: cusparseStruct->diagGPUMatFormat = format;
381: cusparseStruct->offdiagGPUMatFormat = format;
382: break;
383: default:
384: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
385: }
386: return(0);
387: }
389: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
390: {
391: MatCUSPARSEStorageFormat format;
392: PetscErrorCode ierr;
393: PetscBool flg;
394: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
395: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
398: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
399: if (A->factortype==MAT_FACTOR_NONE) {
400: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
401: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
402: if (flg) {
403: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
404: }
405: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
406: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
407: if (flg) {
408: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
409: }
410: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
411: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
412: if (flg) {
413: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
414: }
415: }
416: PetscOptionsTail();
417: return(0);
418: }
420: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
421: {
422: PetscErrorCode ierr;
423: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
424: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
425: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
427: MatAssemblyEnd_MPIAIJ(A,mode);
428: if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
429: VecSetType(mpiaij->lvec,VECSEQCUDA);
430: #if defined(PETSC_HAVE_NVSHMEM)
431: {
432: PetscMPIInt result;
433: PetscBool useNvshmem = PETSC_FALSE;
434: PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);
435: if (useNvshmem) {
436: MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);
437: if (result == MPI_IDENT || result == MPI_CONGRUENT) {VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);}
438: }
439: }
440: #endif
441: }
442: if (d_mat) {
443: A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
444: }
445: return(0);
446: }
448: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
449: {
450: PetscErrorCode ierr;
451: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
452: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
453: cudaError_t err;
454: cusparseStatus_t stat;
457: if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
458: if (cusparseStruct->deviceMat) {
459: Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data;
460: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
461: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
462: PetscInfo(A,"Have device matrix\n");
463: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
464: if (jaca->compressedrow.use) {
465: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
466: }
467: if (jacb->compressedrow.use) {
468: err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
469: }
470: err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
471: err = cudaFree(d_mat);CHKERRCUDA(err);
472: }
473: try {
474: if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
475: if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
476: stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
477: /* We want cusparseStruct to use PetscDefaultCudaStream
478: if (cusparseStruct->stream) {
479: err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
480: }
481: */
482: delete cusparseStruct->coo_p;
483: delete cusparseStruct->coo_pw;
484: delete cusparseStruct;
485: } catch(char *ex) {
486: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
487: }
488: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
489: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
490: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
491: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
492: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
493: MatDestroy_MPIAIJ(A);
494: return(0);
495: }
497: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
498: {
499: PetscErrorCode ierr;
500: Mat_MPIAIJ *a;
501: Mat_MPIAIJCUSPARSE *cusparseStruct;
502: cusparseStatus_t stat;
503: Mat A;
506: if (reuse == MAT_INITIAL_MATRIX) {
507: MatDuplicate(B,MAT_COPY_VALUES,newmat);
508: } else if (reuse == MAT_REUSE_MATRIX) {
509: MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
510: }
511: A = *newmat;
512: A->boundtocpu = PETSC_FALSE;
513: PetscFree(A->defaultvectype);
514: PetscStrallocpy(VECCUDA,&A->defaultvectype);
516: a = (Mat_MPIAIJ*)A->data;
517: if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
518: if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
519: if (a->lvec) {
520: VecSetType(a->lvec,VECSEQCUDA);
521: }
523: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
524: a->spptr = new Mat_MPIAIJCUSPARSE;
526: cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
527: cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR;
528: cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
529: cusparseStruct->coo_p = NULL;
530: cusparseStruct->coo_pw = NULL;
531: cusparseStruct->stream = 0; /* We should not need cusparseStruct->stream */
532: stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
533: stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
534: cusparseStruct->deviceMat = NULL;
535: }
537: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
538: A->ops->mult = MatMult_MPIAIJCUSPARSE;
539: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
540: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
541: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
542: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
543: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
544: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
546: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
547: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
548: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
549: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
550: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
551: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
552: return(0);
553: }
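/*
  Usage sketch (illustrative, variable names are assumptions): an existing MPIAIJ matrix can be
  converted to this type in place or into a new matrix, e.g.

    MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);        // convert B itself
    MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&Bgpu);     // or create a separate GPU copy

  The same conversion is reached at runtime with -mat_type mpiaijcusparse on a matrix that
  calls MatSetFromOptions().
*/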
555: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
556: {
560: PetscCUDAInitializeCheck();
561: MatCreate_MPIAIJ(A);
562: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
563: return(0);
564: }
566: /*@
567: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
568: (the default parallel PETSc format). This matrix will ultimately be pushed down
569: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
570: assembly performance the user should preallocate the matrix storage by setting
571: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
572: parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
574: Collective
576: Input Parameters:
577: + comm - MPI communicator
578: . m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
579: . n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
580: . M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
581: . N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
582: . d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix, or NULL
. o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix, or NULL
584: Output Parameter:
585: . A - the matrix
587: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
588: MatXXXXSetPreallocation() paradigm instead of this routine directly.
589: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation() or MatMPIAIJSetPreallocation()]
591: Notes:
592: If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
594: The AIJ format (also called the Yale sparse matrix format or
595: compressed row storage), is fully compatible with standard Fortran 77
596: storage. That is, the stored row and column indices can begin at
597: either one (as in Fortran) or zero. See the users' manual for details.
599: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
600: Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
601: control dynamic memory allocation. For large problems you MUST preallocate memory
602: or you will get TERRIBLE performance; see the users' manual chapter on matrices.
604: By default, this format uses inodes (identical nodes) when possible, to
605: improve numerical efficiency of matrix-vector products and solves. We
606: search for consecutive rows with the same nonzero structure, thereby
607: reusing matrix information to achieve increased efficiency.
609: Level: intermediate
611: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
612: @*/
613: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
614: {
616: PetscMPIInt size;
619: MatCreate(comm,A);
620: MatSetSizes(*A,m,n,M,N);
621: MPI_Comm_size(comm,&size);
622: if (size > 1) {
623: MatSetType(*A,MATMPIAIJCUSPARSE);
624: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
625: } else {
626: MatSetType(*A,MATSEQAIJCUSPARSE);
627: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
628: }
629: return(0);
630: }
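/*
  Example (illustrative only; communicator, sizes, and preallocation values are assumptions):

    Mat A;
    MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
    // ... insert entries with MatSetValues() (or the COO interface), then assemble ...
    MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
    MatDestroy(&A);
*/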
632: /*MC
633: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
635: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
636: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
637: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
639: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
640: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
641: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
642: for communicators controlling multiple processes. It is recommended that you call both of
643: the above preallocation routines for simplicity.
645: Options Database Keys:
646: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
647: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
648: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
649: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
651: Level: beginner
653: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
654: M*/
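/*
  Usage sketch for the options listed above (illustrative; A, M, and N are assumed to be
  declared by the caller):

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
    MatSetFromOptions(A);   // honors e.g. -mat_type aijcusparse -mat_cusparse_storage_format csr
    MatSetUp(A);
*/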
656: /*MC
657: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
659: Level: beginner
661: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
662: M*/
664: // Get the GPU pointer to a stripped-down device representation of the Mat; works for both Seq and MPI Mat.
665: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
666: {
667: #if defined(PETSC_USE_CTABLE)
668: SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
669: #else
670: PetscSplitCSRDataStructure **p_d_mat;
671: PetscMPIInt size,rank;
672: MPI_Comm comm;
673: PetscErrorCode ierr;
674: int *ai,*bi,*aj,*bj;
675: PetscScalar *aa,*ba;
678: PetscObjectGetComm((PetscObject)A,&comm);
679: MPI_Comm_size(comm,&size);
680: MPI_Comm_rank(comm,&rank);
681: if (A->factortype == MAT_FACTOR_NONE) {
682: CsrMatrix *matrixA,*matrixB=NULL;
683: if (size == 1) {
684: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
685: p_d_mat = &cusparsestruct->deviceMat;
686: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
687: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
688: matrixA = (CsrMatrix*)matstruct->mat;
689: bi = bj = NULL; ba = NULL;
690: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
691: } else {
692: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
693: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
694: p_d_mat = &spptr->deviceMat;
695: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
696: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
697: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
698: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
699: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
700: if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
701: matrixA = (CsrMatrix*)matstructA->mat;
702: matrixB = (CsrMatrix*)matstructB->mat;
703: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
704: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
705: ba = thrust::raw_pointer_cast(matrixB->values->data());
706: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
707: }
708: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
709: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
710: aa = thrust::raw_pointer_cast(matrixA->values->data());
711: } else {
712: *B = NULL;
713: return(0);
714: }
715: // mimic the bookkeeping MatSetValues() would do on the host, since it is not called for device assembly
716: if (A->assembled) {
717: if (A->was_assembled) {
718: PetscInfo(A,"Assemble more than once already\n");
719: }
720: A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble, which we no longer call; it is done in AIJ AssemblyEnd, is it still needed here?
721: } else {
722: SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
723: }
724: if (!*p_d_mat) {
725: cudaError_t err;
726: PetscSplitCSRDataStructure *d_mat, h_mat;
727: Mat_SeqAIJ *jaca;
728: PetscInt n = A->rmap->n, nnz;
729: // create and copy
730: PetscInfo(A,"Create device matrix\n");
731: err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
732: err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
733: *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
734: if (size == 1) {
735: jaca = (Mat_SeqAIJ*)A->data;
736: h_mat.rstart = 0; h_mat.rend = A->rmap->n;
737: h_mat.cstart = 0; h_mat.cend = A->cmap->n;
738: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
739: h_mat.offdiag.a = NULL;
740: } else {
741: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742: Mat_SeqAIJ *jacb;
743: jaca = (Mat_SeqAIJ*)aij->A->data;
744: jacb = (Mat_SeqAIJ*)aij->B->data;
745: if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
746: if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
747: // create colmap - this is usually done (lazily) in MatSetValues
748: aij->donotstash = PETSC_TRUE;
749: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
750: jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
751: PetscCalloc1(A->cmap->N+1,&aij->colmap);
752: aij->colmap[A->cmap->N] = -9;
753: PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));
754: {
755: PetscInt ii;
756: for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
757: }
758: if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
759: // allocate B copy data
760: h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
761: h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
762: nnz = jacb->i[n];
764: if (jacb->compressedrow.use) {
765: err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
766: err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
767: } else h_mat.offdiag.i = bi;
768: h_mat.offdiag.j = bj;
769: h_mat.offdiag.a = ba;
771: err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
772: err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
773: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
774: h_mat.offdiag.n = n;
775: }
776: // allocate A copy data
777: nnz = jaca->i[n];
778: h_mat.diag.n = n;
779: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
780: MPI_Comm_rank(comm,&h_mat.rank);
781: if (jaca->compressedrow.use) {
782: err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
783: err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
784: } else {
785: h_mat.diag.i = ai;
786: }
787: h_mat.diag.j = aj;
788: h_mat.diag.a = aa;
789: // copy pointers and metadata to device
790: err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
791: PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);
792: } else {
793: *B = *p_d_mat;
794: }
795: A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
796: return(0);
797: #endif
798: }
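/*
  Device-side usage sketch (an illustration under assumptions, not a PETSc API): the pointer
  returned in B can be dereferenced from a user CUDA kernel. The fields used below (diag.n,
  diag.i, diag.a) are the ones populated by this routine; the kernel itself is hypothetical.

    __global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat,PetscScalar alpha)
    {
      const int nnz = d_mat->diag.i[d_mat->diag.n];   // the row-offset array has diag.n+1 entries
      for (int k = blockIdx.x*blockDim.x + threadIdx.x; k < nnz; k += gridDim.x*blockDim.x) {
        d_mat->diag.a[k] *= alpha;                    // scale every stored value of the diagonal block
      }
    }
*/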