Actual source code: mpiaijcusparse.cu
petsc-3.13.6 2020-09-29
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
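
/*
   Example usage (a sketch; the communicator, global size 100, and the per-row
   counts 5 and 2 are illustrative assumptions): the routine above is normally
   reached through the type-generic MatMPIAIJSetPreallocation() interface after
   the type has been set to MATMPIAIJCUSPARSE.

     #include <petscmat.h>

     Mat            A;
     PetscErrorCode ierr;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     // 5 nonzeros per row in the diagonal block, 2 per row in the off-diagonal block
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
*/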
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* overlap communication and computation: apply the local diagonal block while the halo entries are in flight */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* overlap: form yy + (diagonal block)*xx into zz while the halo entries are in flight */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* the reverse scatter accumulates the off-process contributions of the off-diagonal block into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
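
/*
   The three multiplication routines above share one pattern: start the scatter
   that moves the halo entries, apply the local diagonal block a->A while the
   communication is in flight, then fold in the off-diagonal contribution from
   a->B. They are never called directly; they are reached through the generic
   interface. A sketch of a user-level call (the matrix A is assumed already
   assembled; MatCreateVecs() yields CUDA vectors because this type sets
   defaultvectype to VECCUDA in its constructor below):

     Vec x,y;
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);          // dispatches to MatMult_MPIAIJCUSPARSE
     ierr = MatMultTranspose(A,y,x);CHKERRQ(ierr); // dispatches to MatMultTranspose_MPIAIJCUSPARSE
*/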
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
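
/*
   The options registered above take effect during MatSetFromOptions(), e.g.

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   or the storage format can be selected in code through the public interface
   (a sketch; choosing MAT_CUSPARSE_HYB here is an illustrative assumption,
   MAT_CUSPARSE_CSR being the default set in the constructor below):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
*/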
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  cusparseStruct->stream              = 0;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (respectively o_nnz) is given, then d_nz (respectively o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
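
/*
   Example usage of MatCreateAIJCUSPARSE() (a sketch; the global size 1000 and
   the per-row counts 5 and 1 are illustrative assumptions):

     Mat            A;
     PetscErrorCode ierr;
     // 1000 x 1000 global; 5 nonzeros/row in the diagonal block, 1 in the off-diagonal block
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,1000,1000,5,NULL,1,NULL,&A);CHKERRQ(ierr);
*/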
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
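
/*
   Example of the recommended options-driven paradigm: calling both preallocation
   routines, as suggested above, lets the same code run with either the sequential
   or the parallel variant depending on the communicator and -mat_type (a sketch;
   sizes and per-row counts are illustrative assumptions):

     Mat            A;
     PetscErrorCode ierr;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,1000,1000);CHKERRQ(ierr);
     ierr = MatSetFromOptions(A);CHKERRQ(ierr);   // picks up -mat_type aijcusparse
     ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,1,NULL);CHKERRQ(ierr);
*/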