Actual source code: mpiaijcusparse.cu
petsc-3.9.4 2018-09-11
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
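/*
  A minimal usage sketch (not taken from this file; assumes a standard PETSc program
  with PetscInitialize() already called, and illustrative sizes): the preallocation
  routine above is normally reached through the generic
  MatCreate()/MatSetType()/MatMPIAIJSetPreallocation() sequence rather than being
  called directly.

    Mat      A;
    PetscInt n = 100;                            // local rows/columns per process (illustrative)

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,n,n,PETSC_DETERMINE,PETSC_DETERMINE);
    MatSetType(A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);  // 5 nonzeros/row in the diagonal block, 2 in the off-diagonal block
    // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix ...
    MatDestroy(&A);
*/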
PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscInt rbs,cbs;

  MatGetBlockSizes(mat,&rbs,&cbs);
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,cbs);
    VecSetType(*right,VECCUDA);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,rbs);
    VecSetType(*left,VECCUDA);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}
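/*
  Illustrative sketch (not part of this source file; assumes A is an assembled
  MATMPIAIJCUSPARSE matrix): MatCreateVecs() returns VECCUDA vectors whose layouts
  match the matrix, so they can be passed straight to MatMult(), which dispatches
  to MatMult_MPIAIJCUSPARSE below.

    Vec x,y;

    MatCreateVecs(A,&x,&y);   // x: right-hand vector (column layout), y: left-hand vector (row layout)
    VecSet(x,1.0);
    MatMult(A,x,y);           // y = A*x, performed on the GPU
    VecDestroy(&x);
    VecDestroy(&y);
*/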
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal-block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost data transfers (with MPI messaging
     in between) and the subsequent HostToDevice transfers are launched. Once
     the data-transfer stream is synchronized, to ensure the messaging is
     complete, the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal-block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost data transfers (with MPI messaging
     in between) and the subsequent HostToDevice transfers are launched. Once
     the data-transfer stream is synchronized, to ensure the messaging is
     complete, the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
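/*
  Illustrative sketch (not part of this source file; assumes A is a MATMPIAIJCUSPARSE
  matrix): the storage format of the diagonal and off-diagonal GPU blocks can also be
  chosen programmatically through the public MatCUSPARSESetFormat() interface, which
  dispatches to the routine above.

    MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);    // ELL for the diagonal block
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB); // HYB for the off-diagonal block
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);          // or set both blocks back to CSR
*/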
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij;

  mpiaij = (Mat_MPIAIJ*)A->data;
  MatAssemblyEnd_MPIAIJ(A,mode);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    VecSetType(mpiaij->lvec,VECSEQCUDA);
  }
  return(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    MatCUSPARSEClearHandle(a->A);
    MatCUSPARSEClearHandle(a->B);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
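/*
  Illustrative sketch (not part of this source file; assumes PetscInitialize() has
  been called and uses a deliberately tiny matrix): creating a matrix directly with
  MatCreateAIJCUSPARSE(), one local row per process with a single diagonal entry.

    Mat         A;
    PetscInt    row;
    PetscScalar v = 2.0;
    PetscMPIInt rank;

    MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
    MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,1,1,PETSC_DETERMINE,PETSC_DETERMINE,1,NULL,0,NULL,&A);
    row = rank;                                      // global row owned by this process
    MatSetValues(A,1,&row,1,&row,&v,INSERT_VALUES);  // put a 2 on the diagonal
    MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
    MatDestroy(&A);
*/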
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
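/*
  Illustrative sketch (not part of this source file; option names are taken from the
  Options Database Keys above, sizes are illustrative): selecting this matrix type and
  its GPU storage format at run time, leaving the type choice to MatSetFromOptions().

    Mat A;

    MatCreate(PETSC_COMM_WORLD,&A);
    MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
    MatSetFromOptions(A);                   // picks up -mat_type mpiaijcusparse
    MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);
    MatSeqAIJSetPreallocation(A,5,NULL);    // call both preallocation routines, as recommended above
    // run with, e.g.:
    //   mpiexec -n 2 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
*/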