Actual source code: mpiaijcusparse.cu
petsc-3.7.3 2016-08-01
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
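
/* Editor's note: a minimal usage sketch of the preallocation path above. The
   helper name ExamplePreallocate and the per-row counts (5 diagonal-block and
   2 off-diagonal-block nonzeros) are illustrative assumptions; error checking
   is elided in the same style as the rest of this listing. */
static PetscErrorCode ExamplePreallocate(MPI_Comm comm,PetscInt n,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,n,n);
  MatSetType(*A,MATMPIAIJCUSPARSE);            /* installs the preallocation routine above */
  MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL); /* dispatches to MatMPIAIJSetPreallocation_MPIAIJCUSPARSE */
  return(0);
}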

PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscInt rbs,cbs;

  MatGetBlockSizes(mat,&rbs,&cbs);
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,cbs);
    VecSetType(*right,VECCUDA);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,rbs);
    VecSetType(*left,VECCUDA);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to guard
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
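
/* Editor's note: a minimal usage sketch for the routine above; the helper name
   ExampleApply is an illustrative assumption. MatCreateVecs() returns VECCUDA
   vectors for this matrix type (see MatCreateVecs_MPIAIJCUSPARSE above), so
   MatMult() takes the GPU path described in the comment. */
static PetscErrorCode ExampleApply(Mat A)
{
  Vec x,y;

  MatCreateVecs(A,&x,&y); /* VECCUDA right and left work vectors */
  VecSet(x,1.0);
  MatMult(A,x,y);         /* GPU SpMV with overlapped halo exchange */
  VecDestroy(&x);
  VecDestroy(&y);
  return(0);
}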

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult_MPIAIJCUSPARSE above, the kernels and the data
     transfers (with MPI messaging in between) run in separate streams,
     and the transfer stream is synchronized before the final
     accumulation so that the messaging is complete. Note that the
     forward-scatter sequence used by MatMult is not correct for the
     transpose product: the off-diagonal contribution is computed into
     the ghost vector first and then accumulated into yy with a reverse
     scatter.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);
  /* off-diagonal part: lvec = B^T * xx */
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
  /* diagonal part: yy = A_d^T * xx */
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  /* accumulate the ghost contributions into yy */
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
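
/* Editor's note: a usage sketch for the routine above, normally reached through
   the public MatCUSPARSESetFormat() interface; the helper name and the choice
   of formats are illustrative assumptions. */
static PetscErrorCode ExampleSetFormats(Mat A)
{
  /* different formats for the diagonal and off-diagonal blocks ... */
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_CSR);
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);
  /* ... or one format for both blocks at once */
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);
  return(0);
}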

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
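
/* Editor's note: the options registered above are normally supplied on the
   command line and picked up through MatSetFromOptions(); for example (the
   executable name is a placeholder):

     ./app -mat_type mpiaijcusparse \
           -mat_cusparse_mult_diag_storage_format csr \
           -mat_cusparse_mult_offdiag_storage_format hyb */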

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    MatCUSPARSEClearHandle(a->A);
    MatCUSPARSEClearHandle(a->B);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  MatDestroy_MPIAIJ(A);
  return(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix data will ultimately be pushed
   down to NVIDIA GPUs and the CUSPARSE library used for calculations. For good
   matrix assembly performance the user should preallocate the matrix storage by
   setting the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting
   these parameters accurately, performance during matrix assembly can be increased
   by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/

PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
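
/* Editor's note: a direct-call sketch of the routine above; the matrix size and
   per-row counts are illustrative assumptions, and error checking is elided as
   elsewhere in this listing. */
static PetscErrorCode ExampleCreate(MPI_Comm comm,Mat *A)
{
  /* 100x100 global matrix with at most 5 diagonal-block and 2 off-diagonal-block
     nonzeros per local row */
  MatCreateAIJCUSPARSE(comm,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,A);
  return(0);
}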

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or HYB (hybrid) format; the ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
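
/* Editor's note: the manpage above recommends calling both preallocation
   routines so the same code works on any communicator size; a minimal sketch
   (the helper name and per-row counts are illustrative assumptions, and error
   checking is elided as elsewhere in this listing): */
static PetscErrorCode ExampleCreateByType(MPI_Comm comm,PetscInt n,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,n,n);
  MatSetType(*A,MATAIJCUSPARSE);               /* resolves to the seq or mpi variant */
  MatSeqAIJSetPreallocation(*A,5,NULL);        /* used on a single-process communicator */
  MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL); /* used on multiple processes */
  return(0);
}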