Actual source code: mpiaijcusparse.cu
petsc-3.12.5 2020-03-29
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two MATSEQAIJCUSPARSE blocks: the diagonal block b->A and the off-diagonal block b->B. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatPinToCPU(b->A,B->pinnedtocpu);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatPinToCPU(b->B,B->pinnedtocpu);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
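/* A minimal usage sketch (not part of this source file) of how an application
   reaches the preallocation routine above: MatSetType() installs the
   MPIAIJCUSPARSE implementation, and MatMPIAIJSetPreallocation() dispatches to
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE() through the composed
   "MatMPIAIJSetPreallocation_C" function. Sizes and nonzero counts are
   illustrative; error checking is omitted for brevity.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);   5 nonzeros per row in the diagonal block, 2 in the off-diagonal block
*/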
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data-transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd
     kernel for the off-diagonal block is launched in the original
     (MatMult) stream to guard against race conditions.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  (*a->A->ops->mult)(a->A,xx,yy);                                     /* diagonal block:     yy  = A_d * xx   */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);                          /* off-diagonal block: yy += A_o * lvec */
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
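/* A hedged sketch (not part of this file) of driving the routine above: with
   A of type MATMPIAIJCUSPARSE, A->defaultvectype is VECCUDA (see
   MatCreate_MPIAIJCUSPARSE() below), so vectors obtained from MatCreateVecs()
   are CUDA vectors and MatMult() takes the cuSPARSE path above. Error
   checking omitted for brevity.

     Vec x,y;
     MatCreateVecs(A,&x,&y);    x: right vector, y: left vector, both VECCUDA here
     VecSet(x,1.0);
     MatMult(A,x,y);            dispatches to MatMult_MPIAIJCUSPARSE() for this type
*/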
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     Same streaming strategy as MatMult_MPIAIJCUSPARSE() above: the
     diagonal-block kernel runs in one stream while the scatter's data
     transfers (with MPI messaging in between) run in another; the
     off-diagonal MatMultAdd kernel is launched in the original stream
     only after the transfer stream has synchronized, to guard against
     race conditions.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  (*a->A->ops->multadd)(a->A,xx,yy,zz);                               /* diagonal block:     zz  = A_d * xx + yy */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);                          /* off-diagonal block: zz += A_o * lvec    */
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     Same streaming strategy as MatMult_MPIAIJCUSPARSE() above, with the
     roles of the blocks reversed for the transpose product and the
     scatter run in reverse. This sequence should only be called for
     GPU computation.
  */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,a->lvec);
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);                       /* off-diagonal block: lvec = A_o^T * xx */
  (*a->A->ops->multtranspose)(a->A,xx,yy);                            /* diagonal block:     yy   = A_d^T * xx */
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
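/* A short sketch (not part of this file) of the user-facing call that lands in
   the routine above via the composed "MatCUSPARSESetFormat_C" function. The
   storage-format values come from the MatCUSPARSEStorageFormat enum
   (MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB).

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);    diagonal block only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);          both blocks
*/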
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsTail();
  return(0);
}
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;

  MatAssemblyEnd_MPIAIJ(A,mode);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* After the first final assembly, make the local scatter vector a CUDA vector. */
    VecSetType(mpiaij->lvec,VECSEQCUDA);
  }
  return(0);
}
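/* A minimal assembly sketch (not part of this file): the first
   MAT_FINAL_ASSEMBLY pass through the routine above converts the local
   scatter vector lvec to VECSEQCUDA so subsequent multiplies stay on the GPU.
   Indices and values are illustrative; error checking omitted.

     PetscInt    row = 0, col = 0;
     PetscScalar v   = 1.0;
     MatSetValues(A,1,&row,1,&col,&v,INSERT_VALUES);
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);    calls MatAssemblyEnd_MPIAIJCUSPARSE()
*/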
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    MatCUSPARSEClearHandle(a->A);
    MatCUSPARSEClearHandle(a->B);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  PetscFree(A->defaultvectype);
  PetscStrallocpy(VECCUDA,&A->defaultvectype);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  cusparseStruct->stream              = 0;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or the array d_nnz) and o_nz (or the array o_nnz). By
   setting these parameters accurately, performance during matrix assembly can be
   increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block) for
   PETSc to control dynamic memory allocation. For large problems you MUST preallocate
   memory or you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
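/* A hedged usage sketch (not part of this file) for the convenience routine
   above: create a distributed GPU matrix directly, preallocating 5 nonzeros
   per row in the diagonal block and 2 in the off-diagonal block. The global
   size 128 x 128 is illustrative only; error checking omitted.

     Mat A;
     MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,128,128,5,NULL,2,NULL,&A);
     assemble and use A here
     MatDestroy(&A);
*/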
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
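/* A hedged sketch (not part of this file) of selecting this matrix type and
   its storage format from the options database rather than in code. The
   executable name ./app and the option values are illustrative.

   In the application:
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,128,128);
     MatSetFromOptions(A);    picks up -mat_type and the -mat_cusparse_* options above

   On the command line:
     mpiexec -n 2 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
*/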