Actual source code: mpiaijcusp.cu
petsc-3.6.1 2015-08-06
1: #define PETSC_SKIP_COMPLEX
3: #include <petscconf.h>
4: PETSC_CUDA_EXTERN_C_BEGIN
5: #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
6: PETSC_CUDA_EXTERN_C_END
7: #include <../src/mat/impls/aij/mpi/mpicusp/mpicuspmatimpl.h>
11: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSP(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
12: {
13: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
14: Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)b->spptr;
16: PetscInt i;
19: PetscLayoutSetUp(B->rmap);
20: PetscLayoutSetUp(B->cmap);
21: if (d_nnz) {
22: for (i=0; i<B->rmap->n; i++) {
23: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
24: }
25: }
26: if (o_nnz) {
27: for (i=0; i<B->rmap->n; i++) {
28: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
29: }
30: }
31: if (!B->preallocated) {
32: /* Explicitly create 2 MATSEQAIJCUSP matrices. */
33: MatCreate(PETSC_COMM_SELF,&b->A);
34: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
35: MatSetType(b->A,MATSEQAIJCUSP);
36: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
37: MatCreate(PETSC_COMM_SELF,&b->B);
38: MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
39: MatSetType(b->B,MATSEQAIJCUSP);
40: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
41: }
42: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
43: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
44: MatCUSPSetFormat(b->A,MAT_CUSP_MULT,cuspStruct->diagGPUMatFormat);
45: MatCUSPSetFormat(b->B,MAT_CUSP_MULT,cuspStruct->offdiagGPUMatFormat);
46: MatCUSPSetStream(b->A,cuspStruct->stream);
47: MatCUSPSetStream(b->B,cuspStruct->stream);
48: B->preallocated = PETSC_TRUE;
49: return(0);
50: }
54: PetscErrorCode MatCreateVecs_MPIAIJCUSP(Mat mat,Vec *right,Vec *left)
55: {
57: PetscInt rbs,cbs;
60: MatGetBlockSizes(mat,&rbs,&cbs);
61: if (right) {
62: VecCreate(PetscObjectComm((PetscObject)mat),right);
63: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
64: VecSetBlockSize(*right,cbs);
65: VecSetType(*right,VECCUSP);
66: VecSetLayout(*right,mat->cmap);
67: }
68: if (left) {
69: VecCreate(PetscObjectComm((PetscObject)mat),left);
70: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
71: VecSetBlockSize(*left,rbs);
72: VecSetType(*left,VECCUSP);
73: VecSetLayout(*left,mat->rmap);
74: }
75: return(0);
76: }
81: PetscErrorCode MatMult_MPIAIJCUSP(Mat A,Vec xx,Vec yy)
82: {
83: /* This multiplication sequence is different sequence
84: than the CPU version. In particular, the diagonal block
85: multiplication kernel is launched in one stream. Then,
86: in a separate stream, the data transfers from DeviceToHost
87: (with MPI messaging in between), then HostToDevice are
88: launched. Once the data transfer stream is synchronized,
89: to ensure messaging is complete, the MatMultAdd kernel
90: is launched in the original (MatMult) stream to protect
91: against race conditions.
93: This sequence should only be called for GPU computation. */
94: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
96: PetscInt nt;
99: VecGetLocalSize(xx,&nt);
100: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
101: VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
102: (*a->A->ops->mult)(a->A,xx,yy);
103: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
104: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
105: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
106: VecScatterFinalizeForGPU(a->Mvctx);
107: return(0);
108: }
110: PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);
114: PetscErrorCode MatCUSPSetFormat_MPIAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
115: {
116: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
117: Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
120: switch (op) {
121: case MAT_CUSP_MULT_DIAG:
122: cuspStruct->diagGPUMatFormat = format;
123: break;
124: case MAT_CUSP_MULT_OFFDIAG:
125: cuspStruct->offdiagGPUMatFormat = format;
126: break;
127: case MAT_CUSP_ALL:
128: cuspStruct->diagGPUMatFormat = format;
129: cuspStruct->offdiagGPUMatFormat = format;
130: break;
131: default:
132: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_DIAG, and MAT_CUSP_MULT_ALL are currently supported.",op);
133: }
134: return(0);
135: }
139: PetscErrorCode MatSetFromOptions_MPIAIJCUSP(PetscOptions *PetscOptionsObject,Mat A)
140: {
141: MatCUSPStorageFormat format;
142: PetscErrorCode ierr;
143: PetscBool flg;
144: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
145: Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
148: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSP options");
149: PetscObjectOptionsBegin((PetscObject)A);
150: if (A->factortype==MAT_FACTOR_NONE) {
151: PetscOptionsEnum("-mat_cusp_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
152: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
153: if (flg) {
154: MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,format);
155: }
156: PetscOptionsEnum("-mat_cusp_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusp gpu matrices for SpMV",
157: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
158: if (flg) {
159: MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,format);
160: }
161: PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusp gpu matrices for SpMV",
162: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
163: if (flg) {
164: MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
165: }
166: }
167: PetscOptionsEnd();
168: return(0);
169: }
173: PetscErrorCode MatDestroy_MPIAIJCUSP(Mat A)
174: {
176: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
177: Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
178: cudaError_t err=cudaSuccess;
181: try {
182: err = cudaStreamDestroy(cuspStruct->stream);
183: if (err!=cudaSuccess)
184: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
185: delete cuspStruct;
186: } catch(char *ex) {
187: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", ex);
188: }
189: cuspStruct = 0;
190: MatDestroy_MPIAIJ(A);
191: return(0);
192: }
196: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSP(Mat A)
197: {
199: Mat_MPIAIJ *a;
200: Mat_MPIAIJCUSP * cuspStruct;
201: cudaError_t err=cudaSuccess;
204: MatCreate_MPIAIJ(A);
205: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSP);
206: A->ops->getvecs = MatCreateVecs_MPIAIJCUSP;
207: A->ops->setvaluesbatch = MatSetValuesBatch_MPIAIJCUSP;
209: a = (Mat_MPIAIJ*)A->data;
210: a->spptr = new Mat_MPIAIJCUSP;
211: cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
213: cuspStruct->diagGPUMatFormat = MAT_CUSP_CSR;
214: cuspStruct->offdiagGPUMatFormat = MAT_CUSP_CSR;
215: err = cudaStreamCreate(&(cuspStruct->stream));
216: if (err!=cudaSuccess)
217: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
219: A->ops->mult = MatMult_MPIAIJCUSP;
220: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSP;
221: A->ops->destroy = MatDestroy_MPIAIJCUSP;
223: PetscObjectComposeFunction((PetscObject)A,"MatCUSPSetFormat_C", MatCUSPSetFormat_MPIAIJCUSP);
224: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSP);
225: return(0);
226: }
229: /*@
230: MatCreateAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
231: (the default parallel PETSc format). This matrix will ultimately pushed down
232: to NVidia GPUs and use the CUSP library for calculations. For good matrix
233: assembly performance the user should preallocate the matrix storage by setting
234: the parameter nz (or the array nnz). By setting these parameters accurately,
235: performance during matrix assembly can be increased by more than a factor of 50.
238: Collective on MPI_Comm
240: Input Parameters:
241: + comm - MPI communicator, set to PETSC_COMM_SELF
242: . m - number of rows
243: . n - number of columns
244: . nz - number of nonzeros per row (same for all rows)
245: - nnz - array containing the number of nonzeros in the various rows
246: (possibly different for each row) or NULL
248: Output Parameter:
249: . A - the matrix
251: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
252: MatXXXXSetPreallocation() paradigm instead of this routine directly.
253: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
255: Notes:
256: If nnz is given then nz is ignored
258: The AIJ format (also called the Yale sparse matrix format or
259: compressed row storage), is fully compatible with standard Fortran 77
260: storage. That is, the stored row and column indices can begin at
261: either one (as in Fortran) or zero. See the users' manual for details.
263: Specify the preallocated storage with either nz or nnz (not both).
264: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
265: allocation. For large problems you MUST preallocate memory or you
266: will get TERRIBLE performance, see the users' manual chapter on matrices.
268: By default, this format uses inodes (identical nodes) when possible, to
269: improve numerical efficiency of matrix-vector products and solves. We
270: search for consecutive rows with the same nonzero structure, thereby
271: reusing matrix information to achieve increased efficiency.
273: Level: intermediate
275: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSP, MATAIJCUSP
276: @*/
279: PetscErrorCode MatCreateAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
280: {
282: PetscMPIInt size;
285: MatCreate(comm,A);
286: MatSetSizes(*A,m,n,M,N);
287: MPI_Comm_size(comm,&size);
288: if (size > 1) {
289: MatSetType(*A,MATMPIAIJCUSP);
290: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
291: } else {
292: MatSetType(*A,MATSEQAIJCUSP);
293: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
294: }
295: return(0);
296: }
298: /*M
299: MATAIJCUSP - MATMPIAIJCUSP= "aijcusp" = "mpiaijcusp" - A matrix type to be used for sparse matrices.
301: A matrix type type whose data resides on Nvidia GPUs. These matrices can be CSR format.
302: All matrix calculations are performed using the CUSP library. DIA and ELL
303: formats are also available
305: This matrix type is identical to MATSEQAIJCUSP when constructed with a single process communicator,
306: and MATMPIAIJCUSP otherwise. As a result, for single process communicators,
307: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
308: for communicators controlling multiple processes. It is recommended that you call both of
309: the above preallocation routines for simplicity.
311: Options Database Keys:
312: + -mat_type mpiaijcusp - sets the matrix type to "mpiaijcusp" during a call to MatSetFromOptions()
313: . -mat_cusp_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
314: . -mat_cusp_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
315: - -mat_cusp_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
317: Level: beginner
319: .seealso: MatCreateAIJCUSP(), MATSEQAIJCUSP, MatCreateSeqAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
320: M*/