Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4: #include <petscconf.h>
5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8: #include <thrust/advance.h>
9: #include <petscsf.h>
11: struct VecCUDAEquals
12: {
13: template <typename Tuple>
14: __host__ __device__
15: void operator()(Tuple t)
16: {
17: thrust::get<1>(t) = thrust::get<0>(t);
18: }
19: };
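/* The two routines below implement the generic MatSetPreallocationCOO()/MatSetValuesCOO()
   interface (composed as "MatSetPreallocationCOO_C" and "MatSetValuesCOO_C" further down).
   A caller-side sketch, with error checking omitted and array names being placeholders:

     MatSetType(A,MATMPIAIJCUSPARSE);
     MatSetPreallocationCOO(A,n,coo_i,coo_j);  // global row/col indices of the n local COO entries
     MatSetValuesCOO(A,v,INSERT_VALUES);       // v may live on the host or on the device (see isCudaMem() below)
*/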
21: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
22: {
23: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
24: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
25: PetscInt n = cusp->coo_nd + cusp->coo_no;
26: PetscErrorCode ierr;
29: if (cusp->coo_p && v) {
30: thrust::device_ptr<const PetscScalar> d_v;
31: THRUSTARRAY *w = NULL;
33: if (isCudaMem(v)) {
34: d_v = thrust::device_pointer_cast(v);
35: } else {
36: w = new THRUSTARRAY(n);
37: w->assign(v,v+n);
38: PetscLogCpuToGpu(n*sizeof(PetscScalar));
39: d_v = w->data();
40: }
42: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
43: cusp->coo_pw->begin()));
44: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
45: cusp->coo_pw->end()));
46: PetscLogGpuTimeBegin();
47: thrust::for_each(zibit,zieit,VecCUDAEquals());
48: PetscLogGpuTimeEnd();
49: delete w;
50: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);
51: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);
52: } else {
53: MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);
54: MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);
55: }
56: PetscObjectStateIncrease((PetscObject)A);
57: A->num_ass++;
58: A->assembled = PETSC_TRUE;
59: A->ass_nonzerostate = A->nonzerostate;
60: A->offloadmask = PETSC_OFFLOAD_GPU;
61: return(0);
62: }
64: template <typename Tuple>
65: struct IsNotOffDiagT
66: {
67: PetscInt _cstart,_cend;
69: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
70: __host__ __device__
71: inline bool operator()(Tuple t)
72: {
73: return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
74: }
75: };
77: struct IsOffDiag
78: {
79: PetscInt _cstart,_cend;
81: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
82: __host__ __device__
83: inline bool operator() (const PetscInt &c)
84: {
85: return c < _cstart || c >= _cend;
86: }
87: };
89: struct GlobToLoc
90: {
91: PetscInt _start;
93: GlobToLoc(PetscInt start) : _start(start) {}
94: __host__ __device__
95: inline PetscInt operator() (const PetscInt &c)
96: {
97: return c - _start;
98: }
99: };
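/* Preallocation from a list of COO entries: the entries are partitioned on the GPU into the
   diagonal block (columns in [cstart,cend)) followed by the off-diagonal block, indices are
   converted from global to local, the compacted off-diagonal column ids are gathered into
   b->garray, and the two blocks are handed to the SEQAIJCUSPARSE preallocation routine as
   b->A and b->B. */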
101: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
102: {
103: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
104: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
105: PetscErrorCode ierr;
106: PetscInt *jj;
107: size_t noff = 0;
108:   THRUSTINTARRAY d_i(n); /* on device; stores the partitioned coo_i with diagonal entries first and off-diagonal entries next */
109: THRUSTINTARRAY d_j(n);
110: ISLocalToGlobalMapping l2g;
111: cudaError_t cerr;
114: PetscLayoutSetUp(B->rmap);
115: PetscLayoutSetUp(B->cmap);
116: if (b->A) { MatCUSPARSEClearHandle(b->A); }
117: if (b->B) { MatCUSPARSEClearHandle(b->B); }
118: PetscFree(b->garray);
119: VecDestroy(&b->lvec);
120: MatDestroy(&b->A);
121: MatDestroy(&b->B);
123: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
124: d_i.assign(coo_i,coo_i+n);
125: d_j.assign(coo_j,coo_j+n);
126: delete cusp->coo_p;
127: delete cusp->coo_pw;
128: cusp->coo_p = NULL;
129: cusp->coo_pw = NULL;
130: PetscLogGpuTimeBegin();
131: auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
132: auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
133: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
134: cusp->coo_p = new THRUSTINTARRAY(n);
135: cusp->coo_pw = new THRUSTARRAY(n);
136: thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
137: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
138: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
139: auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
140: firstoffd = mzipp.get_iterator_tuple().get<1>();
141: }
142: cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
143: cusp->coo_no = thrust::distance(firstoffd,d_j.end());
145: /* from global to local */
146: thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
147: thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
148: PetscLogGpuTimeEnd();
150:   /* copy the off-diagonal column indices to the CPU to build the local-to-global mapping */
151: PetscMalloc1(cusp->coo_no,&jj); /* jj[] will store compacted col ids of the offdiag part */
152: cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
153: auto o_j = d_j.begin();
154: PetscLogGpuTimeBegin();
155:   thrust::advance(o_j,cusp->coo_nd); /* o_j now points to the start of the off-diagonal col ids; sort them and remove duplicates below */
156: thrust::sort(thrust::device,o_j,d_j.end());
157: auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
158: PetscLogGpuTimeEnd();
159: noff = thrust::distance(o_j,wit);
160: PetscMalloc1(noff,&b->garray);
161: cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
162: PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));
163: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);
164: ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);
165: ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);
166:   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D (COO size)",n,cusp->coo_no);
167: ISLocalToGlobalMappingDestroy(&l2g);
169: MatCreate(PETSC_COMM_SELF,&b->A);
170: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
171: MatSetType(b->A,MATSEQAIJCUSPARSE);
172: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
173: MatCreate(PETSC_COMM_SELF,&b->B);
174: MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);
175: MatSetType(b->B,MATSEQAIJCUSPARSE);
176: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
178:   /* GPU memory; the cusparse-specific call handles it internally */
179: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());
180: MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);
181: PetscFree(jj);
183: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);
184: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);
185: MatCUSPARSESetHandle(b->A,cusp->handle);
186: MatCUSPARSESetHandle(b->B,cusp->handle);
187: /*
188: MatCUSPARSESetStream(b->A,cusp->stream);
189: MatCUSPARSESetStream(b->B,cusp->stream);
190: */
191: MatSetUpMultiply_MPIAIJ(B);
192: B->preallocated = PETSC_TRUE;
193: B->nonzerostate++;
195: MatBindToCPU(b->A,B->boundtocpu);
196: MatBindToCPU(b->B,B->boundtocpu);
197: B->offloadmask = PETSC_OFFLOAD_CPU;
198: B->assembled = PETSC_FALSE;
199: B->was_assembled = PETSC_FALSE;
200: return(0);
201: }
203: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
204: {
205: Mat Ad,Ao;
206: const PetscInt *cmap;
210: MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);
211: MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);
212: if (glob) {
213: PetscInt cst, i, dn, on, *gidx;
215: MatGetLocalSize(Ad,NULL,&dn);
216: MatGetLocalSize(Ao,NULL,&on);
217: MatGetOwnershipRangeColumn(A,&cst,NULL);
218: PetscMalloc1(dn+on,&gidx);
219: for (i=0; i<dn; i++) gidx[i] = cst + i;
220: for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
221: ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);
222: }
223: return(0);
224: }
226: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
227: {
228: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
229: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
230: PetscErrorCode ierr;
231: PetscInt i;
234: PetscLayoutSetUp(B->rmap);
235: PetscLayoutSetUp(B->cmap);
236: if (PetscDefined(USE_DEBUG) && d_nnz) {
237: for (i=0; i<B->rmap->n; i++) {
238: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239: }
240: }
241: if (PetscDefined(USE_DEBUG) && o_nnz) {
242: for (i=0; i<B->rmap->n; i++) {
243: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
244: }
245: }
246: #if defined(PETSC_USE_CTABLE)
247: PetscTableDestroy(&b->colmap);
248: #else
249: PetscFree(b->colmap);
250: #endif
251: PetscFree(b->garray);
252: VecDestroy(&b->lvec);
253: VecScatterDestroy(&b->Mvctx);
254:   /* Because B will have been resized we simply destroy it and create a new one each time */
255: MatDestroy(&b->B);
256: if (!b->A) {
257: MatCreate(PETSC_COMM_SELF,&b->A);
258: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
259: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
260: }
261: if (!b->B) {
262: PetscMPIInt size;
263: MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
264: MatCreate(PETSC_COMM_SELF,&b->B);
265: MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);
266: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
267: }
268: MatSetType(b->A,MATSEQAIJCUSPARSE);
269: MatSetType(b->B,MATSEQAIJCUSPARSE);
270: MatBindToCPU(b->A,B->boundtocpu);
271: MatBindToCPU(b->B,B->boundtocpu);
272: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
273: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
274: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
275: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
276: MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
277: MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
278: /* Let A, B use b's handle with pre-set stream
279: MatCUSPARSESetStream(b->A,cusparseStruct->stream);
280: MatCUSPARSESetStream(b->B,cusparseStruct->stream);
281: */
282: B->preallocated = PETSC_TRUE;
283: return(0);
284: }
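/* The MatMult/MatMultAdd implementations below follow the usual MPIAIJ pattern: start the
   scatter that gathers the needed off-process entries of xx into a->lvec, apply the diagonal
   block while that communication is in flight, complete the scatter, and finally add the
   off-diagonal block contribution computed from a->lvec. */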
286: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
287: {
288: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
290: PetscInt nt;
293: VecGetLocalSize(xx,&nt);
294: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
295: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
296: (*a->A->ops->mult)(a->A,xx,yy);
297: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
298: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
299: return(0);
300: }
302: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
303: {
304: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
308: MatZeroEntries(l->A);
309: MatZeroEntries(l->B);
310: return(0);
311: }
313: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
314: {
315: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
317: PetscInt nt;
320: VecGetLocalSize(xx,&nt);
321: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
322: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
323: (*a->A->ops->multadd)(a->A,xx,yy,zz);
324: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
325: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
326: return(0);
327: }
329: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
330: {
331: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
333: PetscInt nt;
336: VecGetLocalSize(xx,&nt);
337: if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
338: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
339: (*a->A->ops->multtranspose)(a->A,xx,yy);
340: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
341: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
342: return(0);
343: }
345: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
346: {
347: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
348: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
351: switch (op) {
352: case MAT_CUSPARSE_MULT_DIAG:
353: cusparseStruct->diagGPUMatFormat = format;
354: break;
355: case MAT_CUSPARSE_MULT_OFFDIAG:
356: cusparseStruct->offdiagGPUMatFormat = format;
357: break;
358: case MAT_CUSPARSE_ALL:
359: cusparseStruct->diagGPUMatFormat = format;
360: cusparseStruct->offdiagGPUMatFormat = format;
361: break;
362: default:
363:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
364: }
365: return(0);
366: }
368: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
369: {
370: MatCUSPARSEStorageFormat format;
371: PetscErrorCode ierr;
372: PetscBool flg;
373: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
374: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
377: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
378: if (A->factortype==MAT_FACTOR_NONE) {
379: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
380: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
381: if (flg) {
382: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
383: }
384: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
385: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
386: if (flg) {
387: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
388: }
389: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
390: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
391: if (flg) {
392: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
393: }
394: }
395: PetscOptionsTail();
396: return(0);
397: }
399: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
400: {
401: PetscErrorCode ierr;
402: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
403: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
404: PetscObjectState onnz = A->nonzerostate;
407: MatAssemblyEnd_MPIAIJ(A,mode);
408: if (mpiaij->lvec) { VecSetType(mpiaij->lvec,VECSEQCUDA); }
409: if (onnz != A->nonzerostate && cusp->deviceMat) {
410: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
411: cudaError_t cerr;
413: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
414: PetscNew(&h_mat);
415: cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
416: cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
417: cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
418: PetscFree(h_mat);
419: cusp->deviceMat = NULL;
420: }
421: return(0);
422: }
424: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
425: {
426: PetscErrorCode ierr;
427: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
428: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
429: cusparseStatus_t stat;
432: if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
433: if (cusparseStruct->deviceMat) {
434: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
435: cudaError_t cerr;
437: PetscInfo(A,"Have device matrix\n");
438: PetscNew(&h_mat);
439: cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
440: cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
441: cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
442: PetscFree(h_mat);
443: }
444: try {
445: if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
446: if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
447: stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
448: /* We want cusparseStruct to use PetscDefaultCudaStream
449: if (cusparseStruct->stream) {
450: cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
451: }
452: */
453: delete cusparseStruct->coo_p;
454: delete cusparseStruct->coo_pw;
455: delete cusparseStruct;
456: } catch(char *ex) {
457: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
458: }
459: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
460: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);
461: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
462: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
463: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
464: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);
465: MatDestroy_MPIAIJ(A);
466: return(0);
467: }
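/* Conversion from MPIAIJ: switches the sub-matrices and the local work vector to their CUDA
   types, allocates the Mat_MPIAIJCUSPARSE spptr with a cusparse handle, and installs the
   CUSPARSE-specific operations and composed functions. MatCreate_MPIAIJCUSPARSE below reuses
   this routine for the in-place case. */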
469: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
470: {
471: PetscErrorCode ierr;
472: Mat_MPIAIJ *a;
473: Mat_MPIAIJCUSPARSE *cusparseStruct;
474: cusparseStatus_t stat;
475: Mat A;
478: if (reuse == MAT_INITIAL_MATRIX) {
479: MatDuplicate(B,MAT_COPY_VALUES,newmat);
480: } else if (reuse == MAT_REUSE_MATRIX) {
481: MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
482: }
483: A = *newmat;
484: A->boundtocpu = PETSC_FALSE;
485: PetscFree(A->defaultvectype);
486: PetscStrallocpy(VECCUDA,&A->defaultvectype);
488: a = (Mat_MPIAIJ*)A->data;
489: if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
490: if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
491: if (a->lvec) {
492: VecSetType(a->lvec,VECSEQCUDA);
493: }
495: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
496: a->spptr = new Mat_MPIAIJCUSPARSE;
498: cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
499: cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR;
500: cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
501: cusparseStruct->coo_p = NULL;
502: cusparseStruct->coo_pw = NULL;
503: cusparseStruct->stream = 0;
504: cusparseStruct->deviceMat = NULL;
505: stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
506: }
508: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
509: A->ops->mult = MatMult_MPIAIJCUSPARSE;
510: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
511: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
512: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
513: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
514: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
515: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
517: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
518: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
519: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
520: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
521: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);
522: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);
523: #if defined(PETSC_HAVE_HYPRE)
524: PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
525: #endif
526: return(0);
527: }
529: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
530: {
534: PetscCUDAInitializeCheck();
535: MatCreate_MPIAIJ(A);
536: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
537: return(0);
538: }
540: /*@
541: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
542:    (the default parallel PETSc format). This matrix will ultimately be pushed down
543:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
544:    assembly performance the user should preallocate the matrix storage by setting
545:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
546:    parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
548: Collective
550: Input Parameters:
551: +  comm - MPI communicator
552: .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
553: .  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
554: .  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
555: .  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
556: .  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal portion of the local submatrix, or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion of the local submatrix, or NULL
558: Output Parameter:
559: . A - the matrix
561: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
562: MatXXXXSetPreallocation() paradigm instead of this routine directly.
563:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation(); see the example usage below]
565: Notes:
566:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
568: The AIJ format (also called the Yale sparse matrix format or
569: compressed row storage), is fully compatible with standard Fortran 77
570: storage. That is, the stored row and column indices can begin at
571: either one (as in Fortran) or zero. See the users' manual for details.
573:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
574:    Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL, and o_nnz=NULL for PETSc to control dynamic memory
575: allocation. For large problems you MUST preallocate memory or you
576: will get TERRIBLE performance, see the users' manual chapter on matrices.
578: By default, this format uses inodes (identical nodes) when possible, to
579: improve numerical efficiency of matrix-vector products and solves. We
580: search for consecutive rows with the same nonzero structure, thereby
581: reusing matrix information to achieve increased efficiency.
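   Example usage (a minimal sketch of the recommended MatCreate()/MatSetType()/preallocation
   paradigm described above; sizes and preallocation arguments are placeholders):
.vb
   Mat A;
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,M,N);
   MatSetType(A,MATAIJCUSPARSE);
   MatSetFromOptions(A);
   MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);
   MatSeqAIJSetPreallocation(A,d_nz,d_nnz);
.ve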
583: Level: intermediate
585: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
586: @*/
587: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
588: {
590: PetscMPIInt size;
593: MatCreate(comm,A);
594: MatSetSizes(*A,m,n,M,N);
595: MPI_Comm_size(comm,&size);
596: if (size > 1) {
597: MatSetType(*A,MATMPIAIJCUSPARSE);
598: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
599: } else {
600: MatSetType(*A,MATSEQAIJCUSPARSE);
601: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
602: }
603: return(0);
604: }
606: /*MC
607:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
609:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either the
610:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
611:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
613: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
614: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
615: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
616: for communicators controlling multiple processes. It is recommended that you call both of
617: the above preallocation routines for simplicity.
619: Options Database Keys:
620: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
621: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
622: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
623: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
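   For example, to select this type and the hybrid storage format for both blocks from the
   command line (the program name is a placeholder):
.vb
   ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
.ve
   The same choice can be made programmatically with MatCUSPARSESetFormat().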
625: Level: beginner
627: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
628: M*/
630: /*MC
631:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
633: Level: beginner
635: .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
636: M*/
638: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
640: // Get GPU pointers to a stripped-down Mat; works for both sequential and MPI Mat.
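// Illustrative sketch (not part of the PETSc API; kernel name and launch configuration are
// placeholders): a user CUDA kernel can traverse the diagonal block of the returned split
// CSR structure through the fields filled in by this routine, e.g.
//
//   __global__ void RowSumsDiag(PetscSplitCSRDataStructure d_mat, PetscScalar *sums)
//   {
//     int r = blockIdx.x*blockDim.x + threadIdx.x;
//     if (r < d_mat->diag.n) {
//       PetscScalar s = 0;
//       for (int k = d_mat->diag.i[r]; k < d_mat->diag.i[r+1]; k++) s += d_mat->diag.a[k];
//       sums[r] = s;
//     }
//   }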
641: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
642: {
643: PetscSplitCSRDataStructure d_mat;
644: PetscMPIInt size;
645: PetscErrorCode ierr;
646: int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
647: PetscScalar *aa = NULL,*ba = NULL;
648: Mat_SeqAIJ *jaca = NULL;
649: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
650: CsrMatrix *matrixA = NULL,*matrixB = NULL;
653: if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
654: if (A->factortype != MAT_FACTOR_NONE) {
655: *B = NULL;
656: return(0);
657: }
658: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
659: if (size == 1) {
660: PetscBool isseqaij;
662: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);
663: if (isseqaij) {
664: jaca = (Mat_SeqAIJ*)A->data;
665: if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
666: cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
667: d_mat = cusparsestructA->deviceMat;
668: MatSeqAIJCUSPARSECopyToGPU(A);
669: } else {
670: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
671: if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
672: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
673: jaca = (Mat_SeqAIJ*)aij->A->data;
674: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
675: d_mat = spptr->deviceMat;
676: MatSeqAIJCUSPARSECopyToGPU(aij->A);
677: }
678: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
679: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
680: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
681: matrixA = (CsrMatrix*)matstruct->mat;
682: bi = NULL;
683: bj = NULL;
684: ba = NULL;
685: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
686: } else {
687: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
688: if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
689: jaca = (Mat_SeqAIJ*)aij->A->data;
690: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
691: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
693: if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
694: cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
695: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
696: if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
697: if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
698: MatSeqAIJCUSPARSECopyToGPU(aij->A);
699: MatSeqAIJCUSPARSECopyToGPU(aij->B);
700: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
701: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
702: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
703: if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
704: matrixA = (CsrMatrix*)matstructA->mat;
705: matrixB = (CsrMatrix*)matstructB->mat;
706: if (jacb->compressedrow.use) {
707: if (!cusparsestructB->rowoffsets_gpu) {
708: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
709: cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
710: }
711: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
712: } else {
713: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
714: }
715: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
716: ba = thrust::raw_pointer_cast(matrixB->values->data());
717: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
718: d_mat = spptr->deviceMat;
719: }
720: if (jaca->compressedrow.use) {
721: if (!cusparsestructA->rowoffsets_gpu) {
722: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
723: cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
724: }
725: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
726: } else {
727: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
728: }
729: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
730: aa = thrust::raw_pointer_cast(matrixA->values->data());
732: if (!d_mat) {
733: cudaError_t cerr;
734: PetscSplitCSRDataStructure h_mat;
736:     // create and populate the struct on the host, then copy it to the device
737: PetscInfo(A,"Create device matrix\n");
738: PetscNew(&h_mat);
739: cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
740: if (size > 1) { /* need the colmap array */
741: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
742: int *colmap;
743: PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N;
745: if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
747: PetscCalloc1(N+1,&colmap);
748: for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
750: h_mat->offdiag.i = bi;
751: h_mat->offdiag.j = bj;
752: h_mat->offdiag.a = ba;
753: h_mat->offdiag.n = A->rmap->n;
755: cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
756: cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
757: PetscFree(colmap);
758: }
759: h_mat->rstart = A->rmap->rstart;
760: h_mat->rend = A->rmap->rend;
761: h_mat->cstart = A->cmap->rstart;
762: h_mat->cend = A->cmap->rend;
763: h_mat->N = A->cmap->N;
764: h_mat->diag.i = ai;
765: h_mat->diag.j = aj;
766: h_mat->diag.a = aa;
767: h_mat->diag.n = A->rmap->n;
768: h_mat->rank = PetscGlobalRank;
769: // copy pointers and metadata to device
770: cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
771: PetscFree(h_mat);
772: } else {
773: PetscInfo(A,"Reusing device matrix\n");
774: }
775: *B = d_mat;
776: A->offloadmask = PETSC_OFFLOAD_GPU;
777: return(0);
778: }