Actual source code: mpiaijcusparse.cu
petsc-3.6.1 2015-08-06
#define PETSC_SKIP_COMPLEX

#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices: b->A holds the diagonal block, b->B the off-diagonal block. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
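For orientation, a minimal caller-side sketch of this preallocation path follows. It is not part of the file; the global size (100) and the per-row estimates (5 diagonal, 2 off-diagonal nonzeros) are illustrative assumptions.

#include <petscmat.h>

/* Illustrative only: create an MPIAIJCUSPARSE matrix and preallocate its
   diagonal and off-diagonal blocks (5 and 2 nonzeros per row are assumed). */
int main(int argc,char **argv)
{
  Mat            A;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
  ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr); /* dispatches to the routine above */
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}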
PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
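A small sketch of this routine as reached through the generic MatCreateVecs() front end; the helper name CreateConformingVecs is hypothetical.

#include <petscmat.h>

/* Hypothetical helper: x inherits A's column layout (suitable as MatMult
   input) and y its row layout (the output); both come back as VECCUSP. */
static PetscErrorCode CreateConformingVecs(Mat A,Vec *x,Vec *y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateVecs(A,x,y);CHKERRQ(ierr);
  ierr = VecSet(*x,1.0);CHKERRQ(ierr); /* the data lives on the GPU, but VecSet works transparently */
  PetscFunctionReturn(0);
}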
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal-block multiplication kernel is launched
     in one stream while, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and then the HostToDevice
     transfers are launched. Once the data-transfer stream is
     synchronized, which ensures the messaging is complete, the
     MatMultAdd kernel is launched in the original (MatMult) stream
     to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);                        /* diagonal block */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);             /* off-diagonal block */
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal-block multiplication kernel is launched
     in one stream while, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and then the HostToDevice
     transfers are launched. Once the data-transfer stream is
     synchronized, which ensures the messaging is complete, the
     MatMultTransposeAdd kernel is launched in the original stream
     to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
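Both routines above overlap the halo scatter with the diagonal-block kernel. The sketch below shows how they are reached through the generic MatMult()/MatMultTranspose() front ends; the helper name ApplyBothWays is hypothetical, and the vectors are assumed to come from MatCreateVecs() so their layouts conform.

#include <petscmat.h>

/* Hypothetical helper: y = A*x dispatches to MatMult_MPIAIJCUSPARSE and
   z = A^T*y to MatMultTranspose_MPIAIJCUSPARSE for this matrix type. */
static PetscErrorCode ApplyBothWays(Mat A,Vec x,Vec y,Vec z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMult(A,x,y);CHKERRQ(ierr);          /* diagonal kernel overlapped with the scatter of x */
  ierr = MatMultTranspose(A,y,z);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}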
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
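A caller-side sketch of the public MatCUSPARSESetFormat() interface, which dispatches to the routine above; choosing MAT_CUSPARSE_HYB for the off-diagonal block is purely illustrative, and the helper name ChooseFormats is hypothetical.

#include <petscmat.h>

/* Hypothetical helper: pick cuSPARSE storage formats per block.
   The ELL and HYB formats require CUDA 4.2 or later (see the manpage below). */
static PetscErrorCode ChooseFormats(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  /* or set both blocks in one call: */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}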
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
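The same formats can be chosen through the options database. The sketch below assumes PETSc 3.6's two-argument PetscOptionsSetValue(); ordinarily one would simply pass the flag on the command line.

#include <petscmat.h>

/* Hypothetical helper: equivalent to running the application with
   -mat_cusparse_storage_format ell on the command line. */
static PetscErrorCode ConfigureFromOptions(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsSetValue("-mat_cusparse_storage_format","ell");CHKERRQ(ierr);
  ierr = MatSetFromOptions(A);CHKERRQ(ierr); /* reaches MatSetFromOptions_MPIAIJCUSPARSE */
  PetscFunctionReturn(0);
}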
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
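The constructor advertises its type-specific routines by composing them on the object. The sketch below shows the lookup side of that mechanism; the helper name SupportsCUSPARSEFormat is hypothetical.

#include <petscmat.h>

/* Hypothetical helper: check whether a matrix supports MatCUSPARSESetFormat()
   by querying the function composed in MatCreate_MPIAIJCUSPARSE() above. */
static PetscErrorCode SupportsCUSPARSEFormat(Mat A,PetscBool *found)
{
  PetscErrorCode (*f)(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat) = NULL;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = PetscObjectQueryFunction((PetscObject)A,"MatCUSPARSESetFormat_C",&f);CHKERRQ(ierr);
  *found = f ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}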
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (respectively o_nnz) is given, then d_nz (respectively o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for o_nz and o_nnz) for PETSc
   to control dynamic memory allocation. For large problems you MUST preallocate
   memory or you will get TERRIBLE performance; see the users' manual chapter on
   matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
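An end-to-end sketch of the convenience constructor documented above; the global size (1000) and the nonzero estimates are illustrative assumptions.

#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat            A;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  /* 1000 x 1000 global; about 5 diagonal-block and 2 off-diagonal-block
     nonzeros per row are assumed purely for illustration */
  ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,1000,1000,5,NULL,2,NULL,&A);CHKERRQ(ierr);
  /* ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd(), then use A ... */
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}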
/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/