Actual source code: mpiaijcusparse.cu
petsc-3.14.6 2021-03-30
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatBindToCPU(b->A,B->boundtocpu);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatBindToCPU(b->B,B->boundtocpu);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSetType(b->A,MATSEQAIJCUSPARSE);
  MatSetType(b->B,MATSEQAIJCUSPARSE);
  if (b->lvec) {
    VecSetType(b->lvec,VECSEQCUDA);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
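/* A minimal usage sketch (not part of the original source) of how a caller typically reaches
   the preallocation routine above through the public API: MatMPIAIJSetPreallocation() dispatches
   here via the composed "MatMPIAIJSetPreallocation_C" function.  The local size 10 and the
   per-row nonzero counts 5 and 2 are hypothetical placeholders; error checking is elided, as in
   the rest of this listing. */
static PetscErrorCode ExamplePreallocateMPIAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,10,10,PETSC_DETERMINE,PETSC_DETERMINE);
  MatSetType(*A,MATMPIAIJCUSPARSE);
  MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL); /* 5 nonzeros/row in the diagonal block, 2 in the off-diagonal block */
  return(0);
}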
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
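/* A minimal sketch (not part of the original source) of driving the multiply above: vectors
   created with MatCreateVecs() inherit the matrix's default vector type, which this matrix type
   sets to VECCUDA, so MatMult() dispatches to MatMult_MPIAIJCUSPARSE and runs on the GPU.
   Error checking is elided, as in the rest of this listing. */
static PetscErrorCode ExampleMultMPIAIJCUSPARSE(Mat A)
{
  Vec x,y;

  MatCreateVecs(A,&x,&y); /* x and y are VECCUDA because A->defaultvectype is VECCUDA */
  VecSet(x,1.0);
  MatMult(A,x,y);
  VecDestroy(&x);
  VecDestroy(&y);
  return(0);
}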
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;

  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      PetscInfo(A,"Zero device matrix diag and offdiag\n");
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  MatZeroEntries(l->A);
  MatZeroEntries(l->B);

  return(0);
}
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->A->ops->multadd)(a->A,xx,yy,zz);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,a->lvec);
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
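/* A minimal sketch (not part of the original source) of selecting storage formats
   programmatically through the public MatCUSPARSESetFormat() interface, which dispatches to the
   routine above.  Whether ELL/HYB are usable at run time depends on the CUDA version (see the
   man page below); error checking is elided, as in the rest of this listing. */
static PetscErrorCode ExampleSetCUSPARSEFormats(Mat A)
{
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);    /* diagonal block in ELL */
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB); /* off-diagonal block in HYB */
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);          /* or both blocks in CSR */
  return(0);
}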
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsTail();
  return(0);
}
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                   nnz_state = A->nonzerostate;

  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  MatAssemblyEnd_MPIAIJ(A,mode);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    VecSetType(mpiaij->lvec,VECSEQCUDA);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  return(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    PetscInfo(A,"Have device matrix\n");
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
    if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(B,MAT_COPY_VALUES,newmat);
  } else if (reuse == MAT_REUSE_MATRIX) {
    MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
  }
  A = *newmat;

  PetscFree(A->defaultvectype);
  PetscStrallocpy(VECCUDA,&A->defaultvectype);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
  if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
  if (a->lvec) {
    VecSetType(a->lvec,VECSEQCUDA);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
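/* A minimal sketch (not part of the original source): converting an existing assembled MPIAIJ
   matrix to this type in place, which ends up in MatConvert_MPIAIJ_MPIAIJCUSPARSE above.
   Error checking is elided, as in the rest of this listing. */
static PetscErrorCode ExampleConvertToMPIAIJCUSPARSE(Mat B)
{
  MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B); /* in-place conversion of B */
  return(0);
}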
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscCUDAInitializeCheck();
  MatCreate_MPIAIJ(A);
  MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
  return(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
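/* A minimal usage sketch (not part of the original source) for the routine above: a square
   matrix with 100 local rows, at most 5 nonzeros per row in the diagonal block and 2 in the
   off-diagonal block.  The sizes and counts are hypothetical placeholders; error checking is
   elided, as in the rest of this listing. */
static PetscErrorCode ExampleCreateAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  MatCreateAIJCUSPARSE(comm,100,100,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,A);
  /* ... then MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix ... */
  return(0);
}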
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
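/* A minimal options-driven sketch (not part of the original source): selecting this type and its
   storage format from the command line, e.g.
       ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
   The local size n and the nonzero counts are hypothetical placeholders; error checking is
   elided, as in the rest of this listing. */
static PetscErrorCode ExampleMatFromOptions(MPI_Comm comm,PetscInt n,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,n,n,PETSC_DETERMINE,PETSC_DETERMINE);
  MatSetFromOptions(*A);                /* picks up -mat_type and the -mat_cusparse_* options above */
  MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL);
  MatSeqAIJSetPreallocation(*A,5,NULL); /* call both preallocation routines, as recommended in the man page above */
  return(0);
}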
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscObjectGetComm((PetscObject)A,&comm);
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    return(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      PetscInfo(A,"Assembled more than once already\n");
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we no longer call it - done in AIJ AssemblyEnd; needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    PetscInfo(A,"Create device matrix\n");
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      PetscCalloc1(A->cmap->N+1,&aij->colmap);
      aij->colmap[A->cmap->N] = -9;
      PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    MPI_Comm_rank(comm,&h_mat.rank);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  return(0);
#endif
}
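/* A minimal host-side sketch (not part of the original source) of how the device matrix handle
   returned above might be used: AssembleOnDevice is a hypothetical user CUDA kernel, not a PETSc
   routine, and the launch configuration is a placeholder.  Because the routine above clears
   A->assembled, the matrix must be assembled again after the device writes; error checking is
   elided, as in the rest of this listing. */
static PetscErrorCode ExampleDeviceAssembly(Mat A)
{
  PetscSplitCSRDataStructure *d_mat;

  MatCUSPARSEGetDeviceMatWrite(A,&d_mat);        /* A must already have been assembled once */
  /* AssembleOnDevice<<<blocks,threads>>>(d_mat, ...);  hypothetical user kernel writing entries */
  MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);        /* finalize after the device writes */
  MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
  return(0);
}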